From a0c863118e087600f63f9ba4a3241884f827cbc6 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Wed, 31 Oct 2018 13:48:09 +0100 Subject: [PATCH] CUDA Parallel Streams & UniformGridGPU Benchmark - LB on GPU, Uniform Grid Benchmark app - helper class to schedule tasks to multiple CUDA streams --- apps/benchmarks/CMakeLists.txt | 1 + apps/benchmarks/UniformGridGPU/CMakeLists.txt | 6 + .../UniformGridGPU/UniformGridGPU.cpp | 180 ++++++++++++++++++ .../UniformGridGPU/UniformGridGPU.gen.py | 59 ++++++ .../UniformGridGPU/UniformGridGPU.prm | 27 +++ .../UniformGridGPU/UniformGridGPUSmall.prm | 27 +++ src/cuda/CMakeLists.txt | 2 +- src/cuda/CudaRAII.h | 138 ++++++++------ src/cuda/ParallelStreams.cpp | 113 +++++++++++ src/cuda/ParallelStreams.h | 100 ++++++++++ src/cuda/communication/UniformGPUScheme.h | 14 +- .../communication/UniformGPUScheme.impl.h | 86 ++++----- 12 files changed, 647 insertions(+), 106 deletions(-) create mode 100644 apps/benchmarks/UniformGridGPU/CMakeLists.txt create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPU.prm create mode 100644 apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm create mode 100644 src/cuda/ParallelStreams.cpp create mode 100644 src/cuda/ParallelStreams.h diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 4fa0c4cd0..2e835d671 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory( PeriodicGranularGas ) add_subdirectory( PoiseuilleChannel ) add_subdirectory( SchaeferTurek ) add_subdirectory( UniformGrid ) +add_subdirectory( UniformGridGPU ) diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt new file mode 100644 index 000000000..0698ea06c --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -0,0 +1,6 @@ + +waLBerla_link_files_to_builddir( "*.prm" ) + +waLBerla_add_executable ( NAME UniformGridBenchmarkGPU + FILES UniformGridGPU.cpp UniformGridGPU.gen.py + DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk ) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp new file mode 100644 index 000000000..0b2ff6b34 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -0,0 +1,180 @@ +#include "core/Environment.h" +#include "python_coupling/CreateConfig.h" +#include "blockforest/Initialization.h" +#include "lbm/field/PdfField.h" +#include "lbm/field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/AddToStorage.h" +#include "lbm/communication/PdfFieldPackInfo.h" +#include "lbm/vtk/VTKOutput.h" +#include "lbm/PerformanceLogger.h" +#include "blockforest/communication/UniformBufferedScheme.h" +#include "timeloop/all.h" +#include "core/math/Random.h" +#include "geometry/all.h" +#include "cuda/HostFieldAllocator.h" +#include "cuda/communication/GPUPackInfo.h" +#include "cuda/ParallelStreams.h" +#include "core/timing/TimingPool.h" +#include "core/timing/RemainingTimeLogger.h" +#include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/UniformGPUScheme.h" +#include "lbm/sweeps/CellwiseSweep.h" +#include "domain_decomposition/SharedSweep.h" + +#include "UniformGridGPU_LatticeModel.h" +#include "UniformGridGPU_LbKernel.h" +#include "UniformGridGPU_PackInfo.h" +#include "UniformGridGPU_UBB.h" +#include "UniformGridGPU_NoSlip.h" + + +using namespace walberla; + +using LatticeModel_T = lbm::UniformGridGPU_LatticeModel; + +using Stencil_T = LatticeModel_T::Stencil; +using CommunicationStencil_T = LatticeModel_T::CommunicationStencil; +using PdfField_T = lbm::PdfField<LatticeModel_T>; +using CommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField<flag_t>; + + + +int main( int argc, char **argv ) +{ + mpi::Environment env( argc, argv ); + + for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + { + auto config = *cfg; + auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + + // Reading parameters + auto parameters = config->getOneBlock( "Parameters" ); + const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); + const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); + const Vector3<real_t> initialVelocity = parameters.getParameter< Vector3<real_t> >( "initialVelocity", Vector3<real_t>() ); + + // Creating fields + auto latticeModel = LatticeModel_T( omega ); + BlockDataID pdfFieldCpuID = lbm::addPdfFieldToStorage( blocks, "pdfs on CPU", latticeModel, initialVelocity, real_t(1), field::fzyx ); + BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); + BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); + + // Boundaries + const FlagUID fluidFlagUID( "Fluid" ); + auto boundariesConfig = config->getOneBlock( "Boundaries" ); + geometry::initBoundaryHandling<FlagField_T>(*blocks, flagFieldID, boundariesConfig); + geometry::setNonBoundaryCellsToDomain<FlagField_T>(*blocks, flagFieldID, fluidFlagUID); + + lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID); + lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID); + //lbm::GeneratedFixedDensity pressure(blocks, pdfFieldGpuID); + + ubb.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID ); + noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); + //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID ); + + + + // Communication setup + bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true ); + bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); + + int streamHighPriority = 0; + int streamLowPriority = 0; + WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); + + pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega ); + lbKernel.setOuterPriority( streamHighPriority ); + CommScheme_T gpuComm( blocks, cudaEnabledMPI ); + gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID )); + + + auto defaultStream = cuda::StreamRAII::newPriorityStream( streamLowPriority ); + auto innerOuterStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority ); + auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority ); + + auto overlapTimeStep = [&]() + { + auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream ); + + innerOuterSection.run([&]( auto innerStream ) + { + for( auto &block: *blocks ) + { + { + auto p = boundaryInnerStreams.parallelSection( innerStream ); + p.run( [&]( auto s ) { ubb.inner( &block, s ); } ); + p.run( [&]( auto s ) { noSlip.inner( &block, s ); } ); + } + lbKernel.inner( &block, innerStream ); + } + }); + + innerOuterSection.run([&]( auto outerStream ) + { + gpuComm( outerStream ); + for( auto &block: *blocks ) + { + { + auto p = boundaryOuterStreams.parallelSection( outerStream ); + p.run( [&]( auto s ) { ubb.outer( &block, s ); } ); + p.run( [&]( auto s ) { noSlip.outer( &block, s ); } ); + } + lbKernel.outer( &block, outerStream ); + } + }); + }; + + + auto boundaryStreams = cuda::ParallelStreams( streamHighPriority ); + auto normalTimeStep = [&]() + { + gpuComm(); + for( auto &block: *blocks ) + { + { + auto p = boundaryStreams.parallelSection( defaultStream ); + p.run( [&]( auto s ) { ubb( &block, s ); } ); + p.run( [&]( auto s ) { noSlip( &block, s ); } ); + } + lbKernel( &block ); + } + }; + + SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); + std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) : + std::function<void()>( normalTimeStep ); + timeLoop.add() << BeforeFunction( timeStep ) + << Sweep( []( IBlock * ) {}, "time step" ); + + // VTK + uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); + if( vtkWriteFrequency > 0 ) + { + auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0 ); + vtkOutput->addCellDataWriter( + make_shared<lbm::VelocityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Velocity" )); + vtkOutput->addCellDataWriter( make_shared<lbm::DensityVTKWriter<LatticeModel_T> >( pdfFieldCpuID, "Density" )); + vtkOutput->addBeforeFunction( + cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID )); + timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); + } + + auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds + timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" ); + + lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500); + timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" ); + + timeLoop.run(); + } + + return 0; +} \ No newline at end of file diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py new file mode 100644 index 000000000..731897463 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.gen.py @@ -0,0 +1,59 @@ +import sympy as sp +from lbmpy_walberla import generate_lattice_model_files +from lbmpy.creationfunctions import create_lb_update_rule +from pystencils_walberla.sweep import Sweep +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import create_lb_method +from lbmpy_walberla.boundary import create_boundary_class +from pystencils_walberla.cmake_integration import codegen + + +dtype = 'float64' + +# LB options +options = { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': sp.Symbol("omega"), + 'field_name': 'pdfs', + 'compressible': False, + 'temporary_field_name': 'pdfs_tmp', + 'optimization': {'cse_global': True, + 'cse_pdfs': True, + 'double_precision': dtype == 'float64'} +} + +# GPU optimization options +inner_opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} +outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype} + + +def lb_assignments(): + ur = create_lb_update_rule(**options) + return ur.all_assignments + + +def genBoundary(): + boundary = UBB([0.05, 0, 0], dim=3, name="UniformGridGPU_UBB") + return create_boundary_class(boundary, create_lb_method(**options), target='gpu') + + +def genNoSlip(): + boundary = NoSlip(name='UniformGridGPU_NoSlip') + return create_boundary_class(boundary, create_lb_method(**options), target='gpu') + + +generate_lattice_model_files(class_name='UniformGridGPU_LatticeModel', **options) + +Sweep.generate_inner_outer_kernel('UniformGridGPU_LbKernel', + lambda: create_lb_update_rule(**options).all_assignments, + target='gpu', + temporary_fields=['pdfs_tmp'], + field_swaps=[('pdfs', 'pdfs_tmp')], + optimization=inner_opt, + outer_optimization=outer_opt) + +Sweep.generate_pack_info('UniformGridGPU_PackInfo', lb_assignments, target='gpu') + +codegen.register(['UniformGridGPU_UBB.h', 'UniformGridGPU_UBB.cu'], genBoundary) +codegen.register(['UniformGridGPU_NoSlip.h', 'UniformGridGPU_NoSlip.cu'], genNoSlip) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm new file mode 100644 index 000000000..53a3c0512 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -0,0 +1,27 @@ + +Parameters +{ + omega 1.8; + timesteps 10; + + remainingTimeLoggerFrequency 3; + vtkWriteFrequency 0; + + overlapCommunication true; + cudaEnabledMPI false; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 300, 300, 150 >; + periodic < 0, 0, 1 >; +} + +Boundaries +{ + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } +} diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm new file mode 100644 index 000000000..c6b8ae931 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPUSmall.prm @@ -0,0 +1,27 @@ + +Parameters +{ + omega 1.8; + timesteps 2; + + remainingTimeLoggerFrequency 3; + vtkWriteFrequency 0; + + overlapCommunication false; + cudaEnabledMPI false; +} + +DomainSetup +{ + blocks < 1, 1, 1 >; + cellsPerBlock < 50, 20, 10 >; + periodic < 0, 0, 1 >; +} + +Boundaries +{ + Border { direction W; walldistance -1; flag NoSlip; } + Border { direction E; walldistance -1; flag NoSlip; } + Border { direction S; walldistance -1; flag NoSlip; } + Border { direction N; walldistance -1; flag UBB; } +} diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index b38244fc9..c4ff50133 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -6,4 +6,4 @@ waLBerla_add_module( DEPENDS blockforest core communication domain_decomposition python_coupling field stencil BUILD_ONLY_IF_FOUND CUDA ) -################################################################################################### \ No newline at end of file +################################################################################################### \ No newline at end of file diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h index f179348f7..5e1d7a3e7 100644 --- a/src/cuda/CudaRAII.h +++ b/src/cuda/CudaRAII.h @@ -18,67 +18,97 @@ //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== +#pragma once #include "ErrorChecking.h" - namespace walberla { namespace cuda { -class StreamRAII -{ -public: - ~StreamRAII() { - if( stream_ != 0 ) { - WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ )); - } - } - - StreamRAII( StreamRAII && other) { - stream_ = other.stream_; - other.stream_ = 0; - } - - StreamRAII(const StreamRAII&) = delete; - void operator=( const StreamRAII &) = delete; - operator cudaStream_t() const { return stream_; } - - - static StreamRAII defaultStream() { - StreamRAII result; - result.stream_ = 0; - return result; - } - - static StreamRAII newPriorityStream(int priority) { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority )); - return result; - } - - static StreamRAII newStream() { - StreamRAII result; - WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_)); - return result; - } - -private: - StreamRAII() {} - cudaStream_t stream_; -}; - - -struct EventRAII -{ - explicit EventRAII() { WALBERLA_CUDA_CHECK( cudaEventCreate(&event) ); } - ~EventRAII() { WALBERLA_CUDA_CHECK( cudaEventDestroy(event) ); } - EventRAII(const EventRAII &) = delete; - void operator=( const EventRAII &) = delete; - operator cudaEvent_t() const { return event; } - - cudaEvent_t event; -}; + class StreamRAII + { + public: + ~StreamRAII() + { + if( stream_ != 0 ) { + WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ )); + } + } + + StreamRAII( StreamRAII &&other ) + { + stream_ = other.stream_; + other.stream_ = 0; + } + + StreamRAII( const StreamRAII & ) = delete; + + void operator=( const StreamRAII & ) = delete; + + operator cudaStream_t() const { return stream_; } + + + static StreamRAII defaultStream() + { + StreamRAII result; + result.stream_ = 0; + return result; + } + + static StreamRAII newPriorityStream( int priority ) + { + StreamRAII result; + WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority )); + return result; + } + + static StreamRAII newStream() + { + StreamRAII result; + WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_ )); + return result; + } + + private: + StreamRAII() {} + + cudaStream_t stream_; + }; + + + class EventRAII + { + public: + explicit EventRAII() + { + event = cudaEvent_t(); + WALBERLA_CUDA_CHECK( cudaEventCreate( &event )); + } + + ~EventRAII() + { + if( event != cudaEvent_t() ) + { + WALBERLA_CUDA_CHECK( cudaEventDestroy( event )); + } + } + + EventRAII( const EventRAII & ) = delete; + + void operator=( const EventRAII & ) = delete; + + EventRAII( EventRAII &&other ) + { + event = other.event; + other.event = cudaEvent_t(); + } + + operator cudaEvent_t() const { return event; } + + private: + cudaEvent_t event; + }; } // namespace cuda diff --git a/src/cuda/ParallelStreams.cpp b/src/cuda/ParallelStreams.cpp new file mode 100644 index 000000000..d2fff0416 --- /dev/null +++ b/src/cuda/ParallelStreams.cpp @@ -0,0 +1,113 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file ParallelStreams.cpp +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "cuda/ParallelStreams.h" + +namespace walberla { +namespace cuda { + + + ParallelSection::ParallelSection(ParallelStreams * parent, cudaStream_t mainStream) + : parent_( parent ), mainStream_( mainStream ), counter_( 0 ) + { + WALBERLA_CUDA_CHECK( cudaEventCreate(&startEvent_) ); + WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + } + + ParallelSection::~ParallelSection() + { + synchronize(); + WALBERLA_CUDA_CHECK( cudaEventDestroy(startEvent_) ); + } + + void ParallelSection::next() + { + if( counter_ > 0 ) { + WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->events_[counter_ - 1], parent_->sideStreams_[counter_ - 1] ) ); + } + else { + WALBERLA_CUDA_CHECK( cudaEventRecord( parent_->mainEvent_, mainStream_ ) ); + } + ++counter_; + + parent_->ensureSize( counter_ ); + + WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream(), startEvent_, 0 )); + } + + void ParallelSection::run(const std::function<void( cudaStream_t)> & f) + { + f( stream() ); + next(); + } + + void ParallelSection::synchronize() + { + for( uint_t i=0; i < counter_; ++i ) + for( uint_t j=0; j < counter_; ++j ) + { + if( i == j ) + continue; + + auto & event = i == 0 ? parent_->mainEvent_ : parent_->events_[i - 1]; + cudaStream_t stream = j == 0 ? mainStream_ : parent_->sideStreams_[j - 1]; + WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( stream, event, 0 )); + } + + WALBERLA_CUDA_CHECK( cudaEventRecord( startEvent_, mainStream_ ) ); + } + + cudaStream_t ParallelSection::stream() + { + return counter_ == 0 ? mainStream_ : parent_->sideStreams_[counter_ - 1]; + } + + + + ParallelStreams::ParallelStreams( int priority ) + : streamPriority_( priority ) + { + } + + ParallelSection ParallelStreams::parallelSection( cudaStream_t stream ) { + return ParallelSection(this, stream); + } + + void ParallelStreams::ensureSize( uint_t size ) { + for( uint_t i = sideStreams_.size(); i < size; ++i ) + { + sideStreams_.emplace_back( StreamRAII::newPriorityStream(streamPriority_)); + events_.emplace_back( EventRAII() ); + } + } + + void ParallelStreams::setStreamPriority( int priority ) + { + streamPriority_ = priority; + sideStreams_.clear(); + events_.clear(); + } + + + +} // namespace cuda +} // namespace walberla \ No newline at end of file diff --git a/src/cuda/ParallelStreams.h b/src/cuda/ParallelStreams.h new file mode 100644 index 000000000..8f6348015 --- /dev/null +++ b/src/cuda/ParallelStreams.h @@ -0,0 +1,100 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file ParallelStreams.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== +#pragma once +#include "cuda/ErrorChecking.h" +#include "cuda/CudaRAII.h" + +#include <vector> + +namespace walberla { +namespace cuda { + + class ParallelStreams; + + class ParallelSection + { + public: + ~ParallelSection(); + void run( const std::function<void( cudaStream_t )> &f ); + + private: + friend class ParallelStreams; + + ParallelSection( ParallelStreams *parent, cudaStream_t mainStream ); + void synchronize(); + + cudaStream_t stream(); + void next(); + + ParallelStreams * parent_; + cudaStream_t mainStream_; + cudaEvent_t startEvent_; + uint_t counter_; + }; + + + //******************************************************************************************************************* + /*! + * Helper class to run CUDA operations on parallel streams + * + * This class introduces "side streams" that overlap with one "main stream". In a parallel section, multiple + * kernels (or other CUDA operations) are scheduled to the streams. The first "run" is scheduled on the main stream + * all subsequent operations on the side streams. The passed priority affects only the side streams. When + * the parallel section goes out of scope the side streams are synchronized to the main stream via CUDA events. + * + * Example: + * + * \code + * ParallelStreams streams; + * { + * // new scope for the parallel section + * ParallelSection sec = streams.parallelSection( mainCudaStream ); + * sec.run([&] ( cudaStream_t sideStream ) { + * // run something on the side stream + * }); + * // after the parallel section goes out of scope the side streams are synchronized to the main stream + * } + * + * \endcode + * + */ + //******************************************************************************************************************* + class ParallelStreams + { + public: + ParallelStreams( int priority = 0 ); + ParallelSection parallelSection( cudaStream_t stream ); + void setStreamPriority( int priority ); + + private: + friend class ParallelSection; + + void ensureSize( uint_t size ); + + std::vector<StreamRAII> sideStreams_; + std::vector<EventRAII> events_; + EventRAII mainEvent_; + int streamPriority_; + }; + + +} // namespace cuda +} // namespace walberla \ No newline at end of file diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h index b7e0bf06c..b22a3be6c 100644 --- a/src/cuda/communication/UniformGPUScheme.h +++ b/src/cuda/communication/UniformGPUScheme.h @@ -31,6 +31,7 @@ #include "cuda/CudaRAII.h" #include "cuda/communication/GeneratedGPUPackInfo.h" #include "cuda/communication/CustomMemoryBuffer.h" +#include "cuda/ParallelStreams.h" #include <chrono> #include <thread> @@ -46,23 +47,21 @@ namespace communication { { public: explicit UniformGPUScheme( weak_ptr_wrapper<StructuredBlockForest> bf, - const shared_ptr<cuda::EventRAII> & startWaitEvent, bool sendDirectlyFromGPU = false, const int tag = 5432 ); void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); - void startCommunication(); - void wait(); + void startCommunication( cudaStream_t stream = 0); + void wait( cudaStream_t stream = 0); - void operator()() { communicate(); } - inline void communicate() { startCommunication(); wait(); } + void operator()( cudaStream_t stream = 0 ) { communicate( stream ); } + inline void communicate( cudaStream_t stream = 0 ) { startCommunication(stream); wait(stream); } private: void setupCommunication(); weak_ptr_wrapper<StructuredBlockForest> blockForest_; - shared_ptr<cuda::EventRAII> startWaitEvent_; uint_t forestModificationStamp_; bool setupBeforeNextCommunication_; @@ -76,7 +75,8 @@ namespace communication { mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_; std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_; - std::map<stencil::Direction, cuda::StreamRAII> streams_; + + ParallelStreams parallelSectionManager_; struct Header { diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h index 2b9ebd32a..0d7667012 100644 --- a/src/cuda/communication/UniformGPUScheme.impl.h +++ b/src/cuda/communication/UniformGPUScheme.impl.h @@ -19,6 +19,7 @@ // //====================================================================================================================== +#include "cuda/ParallelStreams.h" namespace walberla { namespace cuda { @@ -27,26 +28,26 @@ namespace communication { template<typename Stencil> UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockForest> bf, - const shared_ptr <cuda::EventRAII> &startWaitEvent, bool sendDirectlyFromGPU, const int tag ) : blockForest_( bf ), - startWaitEvent_( startWaitEvent ), setupBeforeNextCommunication_( true ), communicationInProgress_( false ), sendFromGPU_( sendDirectlyFromGPU ), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), - bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ) {} + bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), + parallelSectionManager_( -1 ) + {} template<typename Stencil> - void UniformGPUScheme<Stencil>::startCommunication() + void UniformGPUScheme<Stencil>::startCommunication( cudaStream_t stream ) { WALBERLA_ASSERT( !communicationInProgress_ ); auto forest = blockForest_.lock(); - if( setupBeforeNextCommunication_ || - forest->getBlockForest().getModificationStamp() != forestModificationStamp_ ) + auto currentBlockForestStamp = forest->getBlockForest().getModificationStamp(); + if( setupBeforeNextCommunication_ || currentBlockForestStamp != forestModificationStamp_ ) setupCommunication(); // Schedule Receives @@ -61,44 +62,40 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo bufferSystemGPU_.sendBuffer( it.first ).clear(); // Start filling send buffers - for( auto &iBlock : *forest ) { - auto block = dynamic_cast< Block * >( &iBlock ); - for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) + auto parallelSection = parallelSectionManager_.parallelSection( stream ); + for( auto &iBlock : *forest ) { - const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); - if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) - continue; - auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); - - if( streams_.find( *dir ) == streams_.end() ) - { - streams_.emplace( *dir, StreamRAII::newPriorityStream( -1 )); - } - - auto &ci = streams_.at( *dir ); - - for( auto &pi : packInfos_ ) + auto block = dynamic_cast< Block * >( &iBlock ); + for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) { - auto size = pi->size( *dir, block ); - auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); - WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); - WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( ci, *startWaitEvent_, 0 )); - pi->pack( *dir, gpuDataPtr, block, ci ); + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); + if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) + continue; + auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); - if( !sendFromGPU_ ) + for( auto &pi : packInfos_ ) { - auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); - WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ); - WALBERLA_CUDA_CHECK( - cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, ci )); + parallelSection.run([&](auto s) { + auto size = pi->size( *dir, block ); + auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); + pi->pack( *dir, gpuDataPtr, block, s ); + + if( !sendFromGPU_ ) + { + auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ); + WALBERLA_CUDA_CHECK( cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, s )); + } + }); } } } } - // Busy waiting for packing to finish - then send - for( auto &ci : streams_ ) WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second )); + // wait for packing to finish + cudaStreamSynchronize( stream ); if( sendFromGPU_ ) bufferSystemGPU_.sendAll(); @@ -110,7 +107,7 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo template<typename Stencil> - void UniformGPUScheme<Stencil>::wait() + void UniformGPUScheme<Stencil>::wait( cudaStream_t stream ) { WALBERLA_ASSERT( communicationInProgress_ ); @@ -118,11 +115,11 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo if( sendFromGPU_ ) { + auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo ) { for( auto &header : headers_[recvInfo.rank()] ) { - auto &ci = streams_.at( header.dir ); auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId )); for( auto &pi : packInfos_ ) @@ -130,13 +127,16 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo auto size = pi->size( header.dir, block ); auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci ); + parallelSection.run([&](auto s) { + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); + }); } } } } else { + auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo ) { using namespace std::chrono_literals; @@ -145,7 +145,6 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo gpuBuffer.clear(); for( auto &header : headers_[recvInfo.rank()] ) { - auto &ci = streams_.at( header.dir ); auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId )); for( auto &pi : packInfos_ ) @@ -156,17 +155,16 @@ UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockFo WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); - WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size, - cudaMemcpyHostToDevice, ci )); - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci ); + parallelSection.run([&](auto s) { + WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size, + cudaMemcpyHostToDevice, s )); + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); + }); } } } } - for( auto &ci : streams_ ) - WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second )); - communicationInProgress_ = false; } -- GitLab