GPUPackInfo: add asynchronous (un)packing capabilities

Changes introduced in this commit are the following:

- CUDA streams: Add support for asynchronous (un)packing operations using CUDA
  streams in cuda::communication::GPUPackInfo. Through asynchronous operations
  it is possible to overlap GPU computation and MPI communication in simulations
  (e.g. LBM simulations). Asynchronous copies in CUDA require pinned memory on
  the host, and for that purpose a staging buffer is introduced (i.e.
  cuda::communication::PinnedMemoryBuffer) in the cuda module, which is used to
  stage data between the GPU and the MPI buffers.

- zyxf layout: Add zyxf field layout support in GPUPackInfo through extensions
  of the functions in cuda::GPUCopy.

- Extended GPUPackInfo test: Add stream and zyxf layout tests to the
  GPUPackInfoTest to test the proposed implementation.

- Extended Kernel: add CUDA stream and shared memory configuration support in
  cuda::Kernel class.
Signed-off-by: João Victor Tozatti Risso <joaovictortr@protonmail.com>
parent 4251ffbe
This diff is collapsed.
This diff is collapsed.
......@@ -102,8 +102,8 @@ namespace cuda {
template<typename T> void addFieldIndexingParam( const T & indexing );
void configure( dim3 gridDim, dim3 blockDim );
void operator() () const;
void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 );
void operator() ( cudaStream_t stream = 0 ) const;
protected:
......@@ -118,6 +118,7 @@ namespace cuda {
bool configured_;
dim3 gridDim_;
dim3 blockDim_;
std::size_t sharedMemSize_;
struct ParamInfo {
std::vector<char> data;
......@@ -178,7 +179,8 @@ namespace cuda {
template<typename FP>
Kernel<FP>::Kernel( FP funcPtr )
: funcPtr_ ( funcPtr ),
configured_( false )
configured_( false ),
sharedMemSize_( 0 )
{}
template<typename FP>
......@@ -206,12 +208,13 @@ namespace cuda {
}
template<typename FP>
void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim )
void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize )
{
if ( ! configured_ )
{
gridDim_ = gridDim;
blockDim_ = blockDim;
sharedMemSize_ = sharedMemSize;
configured_ = true;
}
else
......@@ -225,7 +228,7 @@ namespace cuda {
}
template<typename FP>
void Kernel<FP>::operator() () const
void Kernel<FP>::operator() ( cudaStream_t stream ) const
{
// check for correct number of parameter calls
......@@ -235,7 +238,7 @@ namespace cuda {
}
// set the number of blocks and threads,
WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams
WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_, sharedMemSize_, stream ) );
// register all parameters
for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt ) {
......
This diff is collapsed.
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file PinnedMemoryBuffer.h
//! \ingroup cuda
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies.
//
//======================================================================================================================
#pragma once
#include "cuda/ErrorChecking.h"
#include <algorithm>
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
namespace communication {
template< typename T = unsigned char >
// Pinned (page-locked) host-memory buffer, used as a staging area for
// asynchronous CUDA copies between device memory and MPI buffers.
// Storage is managed through cudaMallocHost / cudaFreeHost.
class GenericPinnedMemoryBuffer
{
public:
typedef T ElementType;
// Creates an empty buffer; memory is acquired lazily via resize().
GenericPinnedMemoryBuffer();
// Allocates pinned storage for initSize elements (no allocation if initSize == 0).
GenericPinnedMemoryBuffer( std::size_t initSize );
// Copy constructor: allocates an own pinned block and copies pb's contents.
GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb );
~GenericPinnedMemoryBuffer();
// Raw pointer to the pinned allocation (nullptr when empty).
inline T* ptr() const { return data_; }
// Grows the buffer to hold at least newSize elements; existing
// contents are NOT preserved (see resize() implementation).
inline T* resize( std::size_t newSize );
// Current capacity in elements.
inline std::size_t size() const { return size_; }
// Copy assignment is deliberately disabled.
GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete;
private:
T * data_;          // pinned host allocation, nullptr when empty
std::size_t size_;  // capacity in elements
};
// Byte-wise staging buffer used by GPUPackInfo.
typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer;
template< typename T > // Element type
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer()
   : data_( nullptr ),
     size_( 0 )
{
   // Empty buffer: pinned memory is acquired lazily through resize().
}
template< typename T > // Element type
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize )
   : data_( nullptr ),
     size_( initSize )
{
   // A zero-sized request allocates nothing; otherwise grab page-locked
   // host memory so asynchronous CUDA copies can use this buffer.
   if ( initSize > 0 )
      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) );
}
template< typename T > // Element type
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb )
   : data_( nullptr ), // BUGFIX: must be initialized — when pb is empty the body below
                       // never assigns data_, and the destructor's (data_ != nullptr)
                       // check would otherwise read an indeterminate pointer and could
                       // call cudaFreeHost on garbage (undefined behavior).
     size_( pb.size_ )
{
   // Deep copy: allocate an own pinned block and duplicate pb's contents.
   if ( pb.size_ > 0 )
   {
      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) );
      std::copy( pb.data_, pb.data_ + pb.size_, data_ );
   }
}
template< typename T > // Element type
GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer()
{
   // Release the pinned allocation, if one was ever made.
   if ( data_ != nullptr )
      WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) );
}
template< typename T > // Element type
T * GenericPinnedMemoryBuffer<T>::resize( std::size_t newSize )
{
   // Grow-only policy: requests that already fit are no-ops.
   if ( newSize > size_ )
   {
      // NOTE: old contents are NOT carried over — this is a staging buffer,
      // so callers are expected to refill it after every resize.
      T * freshBlock;
      WALBERLA_CUDA_CHECK( cudaMallocHost( &freshBlock, newSize * sizeof(T) ) );

      T * oldBlock = data_;
      data_ = freshBlock;
      if ( oldBlock != nullptr )
         WALBERLA_CUDA_CHECK( cudaFreeHost( oldBlock ) );

      size_ = newSize;
   }
   return data_;
}
} // namespace communication
} // namespace cuda
} // namespace walberla
......@@ -7,6 +7,9 @@
waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest )
waLBerla_execute_test( NAME GPUPackInfoTest )
waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
waLBerla_execute_test( NAME GPUPackInfoCommunicationTest )
waLBerla_compile_test( FILES FieldTransferTest )
waLBerla_execute_test( NAME FieldTransferTest )
......
//========================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GPUPackInfoCommunicationTest.cpp
//! \ingroup cuda
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams.
//
//========================================================================================================================
#include "core/DataTypes.h"
#include "core/debug/TestSubsystem.h"
#include "core/math/Random.h"
#include "core/mpi/Environment.h"
#include "stencil/Directions.h"
#include "stencil/Iterator.h"
#include "stencil/D3Q27.h"
#include "domain_decomposition/BlockDataID.h"
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "field/GhostLayerField.h"
#include "cuda/ErrorChecking.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/GPUField.h"
#include "cuda/FieldCopy.h"
#include "cuda/communication/GPUPackInfo.h"
#include <cuda_runtime.h>
#include <vector>
using namespace walberla;
using DataType = walberla::uint_t;
using StencilType = stencil::D3Q27;
using FieldType = field::GhostLayerField< DataType, StencilType::Size >;
using GPUFieldType = cuda::GPUField< DataType >;
using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>;
using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >;
static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf };
static uint_t fieldLayoutIndex = 0;
FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
   // Host-side field sized to this block; backed by pinned memory so that
   // asynchronous GPU transfers can use it directly.
   const uint_t xCells = storage->getNumberOfXCells( *block );
   const uint_t yCells = storage->getNumberOfYCells( *block );
   const uint_t zCells = storage->getNumberOfZCells( *block );
   return new FieldType( xCells, yCells, zCells,
                         1,                              // one ghost layer
                         DataType(0),                    // initial value
                         fieldLayouts[fieldLayoutIndex], // layout currently under test
                         make_shared< cuda::HostFieldAllocator< DataType > >() ); // pinned-memory allocator
}
GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
{
   // Device-side counterpart of createField(), one value per PDF direction.
   const uint_t xCells = storage->getNumberOfXCells( *block );
   const uint_t yCells = storage->getNumberOfYCells( *block );
   const uint_t zCells = storage->getNumberOfZCells( *block );
   return new GPUFieldType( xCells, yCells, zCells,
                            StencilType::Size,               // f-size: one entry per stencil direction
                            1,                               // one ghost layer
                            fieldLayouts[fieldLayoutIndex] ); // layout currently under test
}
void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID )
{
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto fieldPtr = block->getData< FieldType >( fieldID );
for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt )
*fieldIt = math::intRandom< DataType >();
}
}
// Drives the equivalence test: one communication step with the synchronous
// GPUPackInfo and one with the stream-based asynchronous GPUPackInfo must
// yield identical field contents, for every supported layout.
int main( int argc, char ** argv )
{
debug::enterTestMode();
mpi::Environment mpiEnv( argc, argv );
// Create one CUDA stream per stencil direction for the asynchronous pack info.
std::vector< cudaStream_t > streams;
for( uint_t i = 0; i < StencilType::Size; ++i )
{
cudaStream_t stream(nullptr);
WALBERLA_CUDA_CHECK( cudaStreamCreate(&stream) );
streams.push_back( stream );
}
const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 );
uint_t nProc = uint_c( MPIManager::instance()->numProcesses() );
// Repeat the whole test for every layout; the global fieldLayoutIndex is read
// by the createField/createGPUField callbacks registered below.
for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
{
auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks
cells[0], cells[1], cells[2], // cells
1, // unit cell spacing
true, // one block per process
true, true, true); // periodic in all directions
BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField,
"ScalarField" );
BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
"syncGPUField" );
BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
"asyncGPUField" );
// Per-rank seed so that each process generates distinct field data.
math::seedRandomGenerator( numeric_cast<boost::mt19937::result_type>( MPIManager::instance()->rank() ) );
// Initialize CPU field with random values
initFields( blocks, sourceFieldId );
// Copy same CPU field to both GPU fields, so both schemes start from identical data
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId );
auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr );
auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr );
}
// Setup communication schemes for synchronous GPUPackInfo
CommSchemeType syncCommScheme(blocks);
syncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( syncGPUFieldId ) );
// Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams
CommSchemeType asyncCommScheme(blocks);
asyncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( asyncGPUFieldId, streams ) );
// Perform one communication step for each scheme
syncCommScheme();
asyncCommScheme();
// Check results: copy both GPU fields back to pinned host fields and compare
// them cell by cell, including ghost layers.
FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
make_shared< cuda::HostFieldAllocator< DataType > >() );
FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
make_shared< cuda::HostFieldAllocator< DataType > >() );
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr );
auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr );
for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ();
syncIt != syncFieldCpu.end();
++syncIt, ++asyncIt )
WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt );
}
}
// Release the CUDA streams created above.
for( uint_t i = 0; i < StencilType::Size; ++i )
WALBERLA_CUDA_CHECK( cudaStreamDestroy(streams[i]) );
return EXIT_SUCCESS;
}
......@@ -34,12 +34,15 @@
#include "stencil/D3Q27.h"
#include <cstring>
#include <vector>
#include <cuda_runtime.h>
#define F_SIZE 19
using namespace walberla;
static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf };
static uint_t fieldLayoutIndex = 0;
cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
{
......@@ -49,7 +52,7 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag
storage->getNumberOfZCells( *block ), // number of cells in z direction
F_SIZE, // fSize
1, // number of ghost layers
field::fzyx );
fieldLayouts[fieldLayoutIndex] );
}
// Tester base class. The communicate() template method allows testing different communication methods.
......@@ -59,7 +62,9 @@ public:
typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType;
GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {}
GPUPackInfoTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
block_( block ), fieldId_( fieldId ), streams_(streams) {}
virtual ~GPUPackInfoTester() {}
void test( stencil::Direction dir )
......@@ -72,7 +77,7 @@ public:
gpuField.zSize(), // number of cells in z direction
1, // number of ghost layers
0, // initial value
field::fzyx);
fieldLayouts[fieldLayoutIndex]);
cpuField.setWithGhostLayer( 0 );
int val = 0;
......@@ -82,7 +87,7 @@ public:
}
cuda::fieldCpy( gpuField, cpuField );
GPUPackInfoType gpuPackInfo( fieldId_ );
GPUPackInfoType gpuPackInfo( fieldId_, streams_ );
communicate( gpuPackInfo, dir );
cuda::fieldCpy( cpuField, gpuField );
......@@ -101,6 +106,7 @@ protected:
IBlock* block_;
BlockDataID fieldId_;
std::vector< cudaStream_t > streams_;
};
......@@ -108,7 +114,7 @@ protected:
class GPUPackInfoBufferTester: public GPUPackInfoTester
{
public:
GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams): GPUPackInfoTester( block, fieldId, streams ) {}
protected:
void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
......@@ -134,7 +140,7 @@ protected:
class GPUPackInfoLocalTester: public GPUPackInfoTester
{
public:
GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ): GPUPackInfoTester( block, fieldId, streams ) {}
protected:
void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
......@@ -151,27 +157,42 @@ int main(int argc, char **argv)
debug::enterTestMode();
MPIManager::instance()->initializeMPI(&argc,&argv);
// Create BlockForest
uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
auto blocks = createUniformBlockGrid(processes,1,1, //blocks
2,2,2, //cells
1, //dx
false, //one block per process
true,true,true);//periodicity
BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
&createGPUField, "ScalarGPUField" );
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
{
GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId );
GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId );
std::vector< cudaStream_t > streams;
for( uint_t s = 0; s < stencil::D3Q27::Size; ++s )
{
cudaStream_t stream(nullptr);
WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) );
streams.push_back( stream );
}
// Create BlockForest
uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
auto blocks = createUniformBlockGrid(processes,1,1, //blocks
2,2,2, //cells
1, //dx
false, //one block per process
true,true,true);//periodicity
BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
&createGPUField, "ScalarGPUField" );
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
{
GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId, streams );
GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId, streams );
for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
{
localTester.test( *dir );
bufferTester.test( *dir );
}
}
for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
for( auto streamIt = streams.begin(); streamIt != streams.end(); ++streamIt )
{
localTester.test( *dir );
bufferTester.test( *dir );
cudaStream_t & stream = *streamIt;
WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) );
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment