From a0c863118e087600f63f9ba4a3241884f827cbc6 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 31 Oct 2018 13:48:09 +0100
Subject: [PATCH] CUDA Parallel Streams & UniformGridGPU Benchmark

- LB on GPU, Uniform Grid Benchmark app
- helper class to schedule tasks to multiple CUDA streams
---
 apps/benchmarks/CMakeLists.txt                |   1 +
 apps/benchmarks/UniformGridGPU/CMakeLists.txt |   6 +
 .../UniformGridGPU/UniformGridGPU.cpp         | 180 ++++++++++++++++++
 .../UniformGridGPU/UniformGridGPU.gen.py      |  59 ++++++
 .../UniformGridGPU/UniformGridGPU.prm         |  27 +++
 .../UniformGridGPU/UniformGridGPUSmall.prm    |  27 +++
 src/cuda/CMakeLists.txt                       |   2 +-
 src/cuda/CudaRAII.h                           | 138 ++++++++------
 src/cuda/ParallelStreams.cpp                  | 113 +++++++++++
 src/cuda/ParallelStreams.h                    | 100 ++++++++++
 src/cuda/communication/UniformGPUScheme.h     |  14 +-
 .../communication/UniformGPUScheme.impl.h     |  86 ++++-----
 12 files changed, 647 insertions(+), 106 deletions(-)
 create mode 100644 apps/benchmarks/UniformGridGPU/CMakeLists.txt
 create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
 create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py
 create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
 create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm
 create mode 100644 src/cuda/ParallelStreams.cpp
 create mode 100644 src/cuda/ParallelStreams.h

diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt
index 4fa0c4cd0..2e835d671 100644
--- a/apps/benchmarks/CMakeLists.txt
+++ b/apps/benchmarks/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory( PeriodicGranularGas )
 add_subdirectory( PoiseuilleChannel )
 add_subdirectory( SchaeferTurek )
 add_subdirectory( UniformGrid )
+add_subdirectory( UniformGridGPU )
diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
new file mode 100644
index 000000000..0698ea06c
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -0,0 +1,6 @@
+
+waLBerla_link_files_to_builddir( "*.prm" )
+
+waLBerla_add_executable ( NAME UniformGridBenchmarkGPU
+                          FILES UniformGridGPU.cpp UniformGridGPU.gen.py
+                          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk )
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
new file mode 100644
index 000000000..0b2ff6b34
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -0,0 +1,180 @@
+#include "core/Environment.h"
+#include "python_coupling/CreateConfig.h"
+#include "blockforest/Initialization.h"
+#include "lbm/field/PdfField.h"
+#include "lbm/field/AddToStorage.h"
+#include "field/FlagField.h"
+#include "field/AddToStorage.h"
+#include "lbm/communication/PdfFieldPackInfo.h"
+#include "lbm/vtk/VTKOutput.h"
+#include "lbm/PerformanceLogger.h"
+#include "blockforest/communication/UniformBufferedScheme.h"
+#include "timeloop/all.h"
+#include "core/math/Random.h"
+#include "geometry/all.h"
+#include "cuda/HostFieldAllocator.h"
+#include "cuda/communication/GPUPackInfo.h"
+#include "cuda/ParallelStreams.h"
+#include "core/timing/TimingPool.h"
+#include "core/timing/RemainingTimeLogger.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "cuda/communication/UniformGPUScheme.h"
+#include "lbm/sweeps/CellwiseSweep.h"
+#include "domain_decomposition/SharedSweep.h"
+
+#include "UniformGridGPU_LatticeModel.h"
+#include "UniformGridGPU_LbKernel.h"
+#include "UniformGridGPU_PackInfo.h"
+#include "UniformGridGPU_UBB.h"
+#include "UniformGridGPU_NoSlip.h"
+
+
+using namespace walberla;
+
+using LatticeModel_T = lbm::UniformGridGPU_LatticeModel;
+
+using Stencil_T = LatticeModel_T::Stencil;
+using CommunicationStencil_T = LatticeModel_T::CommunicationStencil;
+using PdfField_T = lbm::PdfField<LatticeModel_T>;
+using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>;
+
+using flag_t = walberla::uint8_t;
+using FlagField_T = FlagField<flag_t>;
+
+
+
+int main( int argc, char **argv )
+{
+   mpi::Environment env( argc, argv );
+
+   for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
+   {
+      auto config = *cfg;
+      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
+
+      // Reading parameters
+      auto parameters = config->getOneBlock( "Parameters" );
+      const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
+      const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
+      const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() );
+
+      // Creating fields
+      auto latticeModel = LatticeModel_T( omega );
+      BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx );
+      BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
+      BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" );
+
+      // Boundaries
+      const FlagUID fluidFlagUID( "Fluid" );
+      auto boundariesConfig = config->getOneBlock( "Boundaries" );
+      geometry::initBoundaryHandling<FlagField_T>(*blocks, flagFieldID, boundariesConfig);
+      geometry::setNonBoundaryCellsToDomain<FlagField_T>(*blocks, flagFieldID, fluidFlagUID);
+
+      lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
+      lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
+      //lbm::GeneratedFixedDensity pressure(blocks, pdfFieldGpuID);
+
+      ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID );
+      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
+      //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
+
+
+
+      // Communication setup
+      bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
+      bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+
+      int streamHighPriority = 0;
+      int streamLowPriority = 0;
+      WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
+
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega );
+      lbKernel.setOuterPriority( streamHighPriority );
+      CommScheme_T gpuComm( blocks, cudaEnabledMPI );
+      gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID ));
+
+
+      auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority );
+      auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
+      auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
+
+      auto overlapTimeStep = [&]()
+      {
+         auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
+
+         innerOuterSection.run([&]( auto innerStream )
+         {
+            for( auto &block: *blocks )
+            {
+               {
+                  auto p = boundaryInnerStreams.parallelSection( innerStream );
+                  p.run( [&]( auto s ) { ubb.inner( &block, s ); } );
+                  p.run( [&]( auto s ) { noSlip.inner( &block, s ); } );
+               }
+               lbKernel.inner( &block, innerStream );
+            }
+         });
+
+         innerOuterSection.run([&]( auto outerStream )
+         {
+            gpuComm( outerStream );
+            for( auto &block: *blocks )
+            {
+               {
+                  auto p = boundaryOuterStreams.parallelSection( outerStream );
+                  p.run( [&]( auto s ) { ubb.outer( &block, s ); } );
+                  p.run( [&]( auto s ) { noSlip.outer( &block, s ); } );
+               }
+               lbKernel.outer( &block, outerStream );
+            }
+         });
+      };
+
+
+      auto boundaryStreams = cuda::ParallelStreams( streamHighPriority );
+      auto normalTimeStep = [&]()
+      {
+         gpuComm();
+         for( auto &block: *blocks )
+         {
+            {
+               auto p = boundaryStreams.parallelSection( defaultStream );
+               p.run( [&]( auto s ) { ubb( &block, s ); } );
+               p.run( [&]( auto s ) { noSlip( &block, s ); } );
+            }
+            lbKernel( &block );
+         }
+      };
+
+      SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
+      std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) :
+                                                              std::function<void()>( normalTimeStep );
+      timeLoop.add() << BeforeFunction( timeStep  )
+                     << Sweep( []( IBlock * ) {}, "time step" );
+
+      // VTK
+      uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
+      if( vtkWriteFrequency > 0 )
+      {
+         auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
+                                                          "simulation_step", false, true, true, false, 0 );
+         vtkOutput->addCellDataWriter(
+                 make_shared<lbm::VelocityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Velocity" ));
+         vtkOutput->addCellDataWriter( make_shared<lbm::DensityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Density" ));
+         vtkOutput->addBeforeFunction(
+                 cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ));
+         timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
+      }
+
+      auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
+      timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );
+
+      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
+      timeLoop.addFuncAfterTimeStep( performanceLogger, "performance logger" );
+
+      timeLoop.run();
+   }
+
+   return 0;
+}
\ No newline at end of file
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py
new file mode 100644
index 000000000..731897463
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py
@@ -0,0 +1,59 @@
+import sympy as sp
+from lbmpy_walberla import generate_lattice_model_files
+from lbmpy.creationfunctions import create_lb_update_rule
+from pystencils_walberla.sweep import Sweep
+from lbmpy.boundaries import NoSlip, UBB
+from lbmpy.creationfunctions import create_lb_method
+from lbmpy_walberla.boundary import create_boundary_class
+from pystencils_walberla.cmake_integration import codegen
+
+
+dtype = 'float64'
+
+# LB options
+options = {
+    'method': 'srt',
+    'stencil': 'D3Q19',
+    'relaxation_rate': sp.Symbol("omega"),
+    'field_name': 'pdfs',
+    'compressible': False,
+    'temporary_field_name': 'pdfs_tmp',
+    'optimization': {'cse_global': True,
+                     'cse_pdfs': True,
+                     'double_precision': dtype == 'float64'}
+}
+
+# GPU optimization options
+inner_opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)},  'data_type': dtype}
+outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 1)},  'data_type': dtype}
+
+
+def lb_assignments():
+    ur = create_lb_update_rule(**options)
+    return ur.all_assignments
+
+
+def genBoundary():
+    boundary = UBB([0.05, 0, 0], dim=3, name="UniformGridGPU_UBB")
+    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')
+
+
+def genNoSlip():
+    boundary = NoSlip(name='UniformGridGPU_NoSlip')
+    return create_boundary_class(boundary, create_lb_method(**options), target='gpu')
+
+
+generate_lattice_model_files(class_name='UniformGridGPU_LatticeModel', **options)
+
+Sweep.generate_inner_outer_kernel('UniformGridGPU_LbKernel',
+                                  lambda: create_lb_update_rule(**options).all_assignments,
+                                  target='gpu',
+                                  temporary_fields=['pdfs_tmp'],
+                                  field_swaps=[('pdfs', 'pdfs_tmp')],
+                                  optimization=inner_opt,
+                                  outer_optimization=outer_opt)
+
+Sweep.generate_pack_info('UniformGridGPU_PackInfo', lb_assignments, target='gpu')
+
+codegen.register(['UniformGridGPU_UBB.h', 'UniformGridGPU_UBB.cu'], genBoundary)
+codegen.register(['UniformGridGPU_NoSlip.h', 'UniformGridGPU_NoSlip.cu'], genNoSlip)
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
new file mode 100644
index 000000000..53a3c0512
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -0,0 +1,27 @@
+
+Parameters 
+{
+	omega           1.8;
+	timesteps       10;
+
+	remainingTimeLoggerFrequency 3;
+	vtkWriteFrequency 0;
+
+	overlapCommunication true;
+	cudaEnabledMPI false;
+}
+
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  300, 300, 150 >;
+   periodic      <  0,    0, 1 >;  
+}
+
+Boundaries 
+{
+	Border { direction W;    walldistance -1;  flag NoSlip; }
+	Border { direction E;    walldistance -1;  flag NoSlip; }
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag UBB; }
+}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm
new file mode 100644
index 000000000..c6b8ae931
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm
@@ -0,0 +1,27 @@
+
+Parameters 
+{
+	omega           1.8;
+	timesteps       2;
+
+	remainingTimeLoggerFrequency 3;
+	vtkWriteFrequency 0;
+
+	overlapCommunication false;
+	cudaEnabledMPI false;
+}
+
+DomainSetup
+{
+   blocks        <  1,    1, 1 >;
+   cellsPerBlock <  50, 20, 10 >;
+   periodic      <  0,    0, 1 >;  
+}
+
+Boundaries 
+{
+	Border { direction W;    walldistance -1;  flag NoSlip; }
+	Border { direction E;    walldistance -1;  flag NoSlip; }
+    Border { direction S;    walldistance -1;  flag NoSlip; }
+    Border { direction N;    walldistance -1;  flag UBB; }
+}
diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt
index b38244fc9..c4ff50133 100644
--- a/src/cuda/CMakeLists.txt
+++ b/src/cuda/CMakeLists.txt
@@ -6,4 +6,4 @@
 
 waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition python_coupling field stencil BUILD_ONLY_IF_FOUND CUDA )
 
-###################################################################################################                        
\ No newline at end of file
+###################################################################################################
\ No newline at end of file
diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h
index f179348f7..5e1d7a3e7 100644
--- a/src/cuda/CudaRAII.h
+++ b/src/cuda/CudaRAII.h
@@ -18,67 +18,97 @@
 //! \author Martin Bauer <martin.bauer@fau.de>
 //
 //======================================================================================================================
+#pragma once
 
 #include "ErrorChecking.h"
 
-
 namespace walberla {
 namespace cuda {
 
 
-class StreamRAII
-{
-public:
-   ~StreamRAII() {
-       if( stream_ != 0 ) {
-           WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ ));
-       }
-   }
-
-   StreamRAII( StreamRAII && other) {
-      stream_ = other.stream_;
-      other.stream_ = 0;
-   }
-
-   StreamRAII(const StreamRAII&) = delete;
-   void operator=( const StreamRAII &) = delete;
-   operator cudaStream_t() const { return stream_; }
-
-
-   static StreamRAII defaultStream() {
-      StreamRAII result;
-      result.stream_ = 0;
-      return result;
-   }
-
-   static StreamRAII newPriorityStream(int priority) {
-      StreamRAII result;
-      WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority ));
-      return result;
-   }
-
-   static StreamRAII newStream() {
-      StreamRAII result;
-      WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_));
-      return result;
-   }
-
-private:
-   StreamRAII() {}
-   cudaStream_t stream_;
-};
-
-
-struct EventRAII
-{
-    explicit EventRAII() { WALBERLA_CUDA_CHECK( cudaEventCreate(&event) ); }
-    ~EventRAII()         { WALBERLA_CUDA_CHECK( cudaEventDestroy(event) ); }
-    EventRAII(const EventRAII &)   = delete;
-    void operator=( const EventRAII &) = delete;
-    operator cudaEvent_t() const { return event; }
-
-    cudaEvent_t event;
-};
+   class StreamRAII
+   {
+   public:
+      ~StreamRAII()
+      {
+         if( stream_ != 0 ) {
+            WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ ));
+         }
+      }
+
+      StreamRAII( StreamRAII &&other )
+      {
+         stream_ = other.stream_;
+         other.stream_ = 0;
+      }
+
+      StreamRAII( const StreamRAII & ) = delete;
+
+      void operator=( const StreamRAII & ) = delete;
+
+      operator cudaStream_t() const { return stream_; }
+
+
+      static StreamRAII defaultStream()
+      {
+         StreamRAII result;
+         result.stream_ = 0;
+         return result;
+      }
+
+      static StreamRAII newPriorityStream( int priority )
+      {
+         StreamRAII result;
+         WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority ));
+         return result;
+      }
+
+      static StreamRAII newStream()
+      {
+         StreamRAII result;
+         WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_ ));
+         return result;
+      }
+
+   private:
+      StreamRAII() {}
+
+      cudaStream_t stream_;
+   };
+
+
+   class EventRAII
+   {
+   public:
+      explicit EventRAII()
+      {
+         event = cudaEvent_t();
+         WALBERLA_CUDA_CHECK( cudaEventCreate( &event ));
+      }
+
+      ~EventRAII()
+      {
+         if( event != cudaEvent_t() )
+         {
+            WALBERLA_CUDA_CHECK( cudaEventDestroy( event ));
+         }
+      }
+
+      EventRAII( const EventRAII & ) = delete;
+
+      void operator=( const EventRAII & ) = delete;
+
+      EventRAII( EventRAII &&other )
+      {
+         event = other.event;
+         other.event = cudaEvent_t();
+      }
+
+      operator cudaEvent_t() const { return event; }
+
+   private:
+      cudaEvent_t event;
+   };
 
 
 } // namespace cuda
diff --git a/src/cuda/ParallelStreams.cpp b/src/cuda/ParallelStreams.cpp
new file mode 100644
index 000000000..d2fff0416
--- /dev/null
+++ b/src/cuda/ParallelStreams.cpp
@@ -0,0 +1,113 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParallelStreams.cpp
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#include "cuda/ParallelStreams.h"
+
+namespace walberla {
+namespace cuda {
+
+
+   ParallelSection::ParallelSection(ParallelStreams * parent, cudaStream_t mainStream)
+     : parent_( parent ), mainStream_( mainStream ), counter_( 0 )
+   {
+      WALBERLA_CUDA_CHECK( cudaEventCreate(&startEvent_) );
+      WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) );
+   }
+
+   ParallelSection::~ParallelSection()
+   {
+      synchronize();
+      WALBERLA_CUDA_CHECK( cudaEventDestroy(startEvent_) );
+   }
+
+   void ParallelSection::next()
+   {
+      if( counter_ > 0 ) {
+         WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) );
+      }
+      else {
+         WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->mainEvent_, mainStream_ ) );
+      }
+      ++counter_;
+
+      parent_->ensureSize( counter_ );
+
+      WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream(), startEvent_, 0 ));
+   }
+
+   void ParallelSection::run(const std::function<void( cudaStream_t)> & f)
+   {
+      f( stream() );
+      next();
+   }
+
+   void ParallelSection::synchronize()
+   {
+      for( uint_t i=0; i < counter_; ++i )
+         for( uint_t j=0; j < counter_; ++j )
+         {
+            if( i == j )
+               continue;
+
+            auto & event  = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1];
+            cudaStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1];
+            WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream, event, 0 ));
+         }
+
+      WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) );
+   }
+
+   cudaStream_t ParallelSection::stream()
+   {
+      return counter_ == 0 ? mainStream_ : parent_->sideStreams_[counter_ - 1];
+   }
+
+
+
+   ParallelStreams::ParallelStreams( int priority )
+           : streamPriority_( priority )
+   {
+   }
+
+   ParallelSection ParallelStreams::parallelSection( cudaStream_t stream ) {
+      return ParallelSection(this, stream);
+   }
+
+   void ParallelStreams::ensureSize( uint_t size ) {
+      for( uint_t i = sideStreams_.size(); i < size; ++i )
+      {
+         sideStreams_.emplace_back( StreamRAII::newPriorityStream(streamPriority_));
+         events_.emplace_back( EventRAII() );
+      }
+   }
+
+   void ParallelStreams::setStreamPriority( int priority )
+   {
+      streamPriority_ = priority;
+      sideStreams_.clear();
+      events_.clear();
+   }
+
+
+
+} // namespace cuda
+} // namespace walberla
\ No newline at end of file
diff --git a/src/cuda/ParallelStreams.h b/src/cuda/ParallelStreams.h
new file mode 100644
index 000000000..8f6348015
--- /dev/null
+++ b/src/cuda/ParallelStreams.h
@@ -0,0 +1,100 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file ParallelStreams.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+#pragma once
+#include "cuda/ErrorChecking.h"
+#include "cuda/CudaRAII.h"
+
+#include <vector>
+
+namespace walberla {
+namespace cuda {
+
+   class ParallelStreams;
+
+   class ParallelSection
+   {
+   public:
+      ~ParallelSection();
+      void run( const std::function<void( cudaStream_t )> &f );
+
+   private:
+      friend class ParallelStreams;
+
+      ParallelSection( ParallelStreams *parent, cudaStream_t mainStream );
+      void synchronize();
+
+      cudaStream_t stream();
+      void next();
+
+      ParallelStreams * parent_;
+      cudaStream_t mainStream_;
+      cudaEvent_t startEvent_;
+      uint_t counter_;
+   };
+
+
+   //*******************************************************************************************************************
+   /*!
+    * Helper class to run CUDA operations on parallel streams
+    *
+    * This class introduces "side streams" that overlap with one "main stream". In a parallel section, multiple
+    * kernels (or other CUDA operations) are scheduled to the streams. The first "run" is scheduled on the main stream
+    * all subsequent operations on the side streams. The passed priority affects only the side streams. When
+    * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA events.
+    *
+    * Example:
+    *
+    * \code
+    * ParallelStreams streams;
+    * {
+    *   // new scope for the parallel section
+    *   ParallelSection sec = streams.parallelSection( mainCudaStream );
+    *   sec.run([&] ( cudaStream_t sideStream ) {
+    *       // run something on the side stream
+    *   });
+    *   // after the parallel section goes out of scope the side streams are synchronized to the main stream
+    * }
+    *
+    * \endcode
+    *
+    */
+   //*******************************************************************************************************************
+   class ParallelStreams
+   {
+   public:
+      ParallelStreams( int priority = 0 );
+      ParallelSection parallelSection( cudaStream_t stream );
+      void setStreamPriority( int priority );
+
+   private:
+      friend class ParallelSection;
+
+      void ensureSize( uint_t size );
+
+      std::vector<StreamRAII> sideStreams_;
+      std::vector<EventRAII> events_;
+      EventRAII mainEvent_;
+      int streamPriority_;
+   };
+
+
+} // namespace cuda
+} // namespace walberla
\ No newline at end of file
diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h
index b7e0bf06c..b22a3be6c 100644
--- a/src/cuda/communication/UniformGPUScheme.h
+++ b/src/cuda/communication/UniformGPUScheme.h
@@ -31,6 +31,7 @@
 #include "cuda/CudaRAII.h"
 #include "cuda/communication/GeneratedGPUPackInfo.h"
 #include "cuda/communication/CustomMemoryBuffer.h"
+#include "cuda/ParallelStreams.h"
 
 #include <chrono>
 #include <thread>
@@ -46,23 +47,21 @@ namespace communication {
    {
    public:
        explicit UniformGPUScheme( weak_ptr_wrapper<StructuredBlockForest> bf,
-                                  const shared_ptr<cuda::EventRAII> & startWaitEvent,
                                   bool sendDirectlyFromGPU = false,
                                   const int tag = 5432 );
 
        void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi );
 
-       void startCommunication();
-       void wait();
+       void startCommunication( cudaStream_t stream = 0);
+       void wait( cudaStream_t stream = 0);
 
-      void operator()()         { communicate(); }
-      inline void communicate() { startCommunication(); wait(); }
+      void operator()( cudaStream_t stream = 0 )         { communicate( stream ); }
+      inline void communicate( cudaStream_t stream = 0 ) { startCommunication(stream); wait(stream); }
 
    private:
        void setupCommunication();
 
        weak_ptr_wrapper<StructuredBlockForest> blockForest_;
-       shared_ptr<cuda::EventRAII> startWaitEvent_;
        uint_t forestModificationStamp_;
 
        bool setupBeforeNextCommunication_;
@@ -76,7 +75,8 @@ namespace communication {
        mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_;
 
        std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_;
-       std::map<stencil::Direction, cuda::StreamRAII> streams_;
+
+       ParallelStreams parallelSectionManager_;
 
        struct Header
        {
diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h
index 2b9ebd32a..0d7667012 100644
--- a/src/cuda/communication/UniformGPUScheme.impl.h
+++ b/src/cuda/communication/UniformGPUScheme.impl.h
@@ -19,6 +19,7 @@
 //
 //======================================================================================================================
 
+#include "cuda/ParallelStreams.h"
 
 namespace walberla {
 namespace cuda {
@@ -27,26 +28,26 @@ namespace communication {
 
 template<typename Stencil>
 UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockForest> bf,
-                                             const shared_ptr <cuda::EventRAII> &startWaitEvent,
                                              bool sendDirectlyFromGPU,
                                              const int tag )
         : blockForest_( bf ),
-          startWaitEvent_( startWaitEvent ),
           setupBeforeNextCommunication_( true ),
           communicationInProgress_( false ),
           sendFromGPU_( sendDirectlyFromGPU ),
           bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ),
-          bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ) {}
+          bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ),
+          parallelSectionManager_( -1 )
+   {}
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::startCommunication()
+   void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream )
    {
       WALBERLA_ASSERT( !communicationInProgress_ );
       auto forest = blockForest_.lock();
 
-      if( setupBeforeNextCommunication_ ||
-          forest->getBlockForest().getModificationStamp() != forestModificationStamp_ )
+      auto currentBlockForestStamp = forest->getBlockForest().getModificationStamp();
+      if( setupBeforeNextCommunication_ || currentBlockForestStamp != forestModificationStamp_ )
          setupCommunication();
 
       // Schedule Receives
@@ -61,44 +62,40 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
             bufferSystemGPU_.sendBuffer( it.first ).clear();
 
       // Start filling send buffers
-      for( auto &iBlock : *forest )
       {
-         auto block = dynamic_cast< Block * >( &iBlock );
-         for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
+         auto parallelSection = parallelSectionManager_.parallelSection( stream );
+         for( auto &iBlock : *forest )
          {
-            const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
-            if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
-               continue;
-            auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
-
-            if( streams_.find( *dir ) == streams_.end() )
-            {
-               streams_.emplace( *dir, StreamRAII::newPriorityStream( -1 ));
-            }
-
-            auto &ci = streams_.at( *dir );
-
-            for( auto &pi : packInfos_ )
+            auto block = dynamic_cast< Block * >( &iBlock );
+            for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir )
             {
-               auto size = pi->size( *dir, block );
-               auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
-               WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
-               WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( ci, *startWaitEvent_, 0 ));
-               pi->pack( *dir, gpuDataPtr, block, ci );
+               const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir );
+               if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 ))
+                  continue;
+               auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 )));
 
-               if( !sendFromGPU_ )
+               for( auto &pi : packInfos_ )
                {
-                  auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
-                  WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
-                  WALBERLA_CUDA_CHECK(
-                          cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, ci ));
+                  parallelSection.run([&](auto s) {
+                     auto size = pi->size( *dir, block );
+                     auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size );
+                     WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
+                     pi->pack( *dir, gpuDataPtr, block, s );
+
+                     if( !sendFromGPU_ )
+                     {
+                        auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size );
+                        WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
+                        WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s ));
+                     }
+                  });
                }
             }
          }
       }
 
-      // Busy waiting for packing to finish - then send
-      for( auto &ci : streams_ ) WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second ));
+      // wait for packing to finish (checked: surfaces async pack/copy errors here)
+      WALBERLA_CUDA_CHECK( cudaStreamSynchronize( stream ));
 
       if( sendFromGPU_ )
          bufferSystemGPU_.sendAll();
@@ -110,7 +107,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
 
 
    template<typename Stencil>
-   void UniformGPUScheme<Stencil>::wait()
+   void UniformGPUScheme<Stencil>::wait( cudaStream_t stream )
    {
       WALBERLA_ASSERT( communicationInProgress_ );
 
@@ -118,11 +115,11 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
 
       if( sendFromGPU_ )
       {
+         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo )
          {
             for( auto &header : headers_[recvInfo.rank()] )
             {
-               auto &ci = streams_.at( header.dir );
                auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));
 
                for( auto &pi : packInfos_ )
@@ -130,13 +127,16 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
                   auto size = pi->size( header.dir, block );
                   auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size );
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
-                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci );
+                  parallelSection.run([&](auto s) {
+                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
+                  });
                }
             }
          }
       }
       else
       {
+         auto parallelSection = parallelSectionManager_.parallelSection( stream );
          for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo )
          {
             using namespace std::chrono_literals;
@@ -145,7 +145,6 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
 
             gpuBuffer.clear();
             for( auto &header : headers_[recvInfo.rank()] ) {
-               auto &ci = streams_.at( header.dir );
                auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId ));
 
                for( auto &pi : packInfos_ )
@@ -156,17 +155,16 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo
                   WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr );
                   WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr );
 
-                  WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
-                                                        cudaMemcpyHostToDevice, ci ));
-                  pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci );
+                  parallelSection.run([&](auto s) {
+                     WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size,
+                                                           cudaMemcpyHostToDevice, s ));
+                     pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s );
+                  });
                }
             }
          }
       }
 
-      for( auto &ci : streams_ )
-         WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second ));
-
       communicationInProgress_ = false;
    }
 
-- 
GitLab