Commit e3d5a02f authored by Martin Bauer's avatar Martin Bauer

Merge branch 'gpupackinfo_stream' into 'master'

Streamed GPUPackInfo

Closes #27

See merge request walberla/walberla!68
parents 62c94737 6bfe8c59
Pipeline #6204 passed with stages
in 90 minutes and 42 seconds
This diff is collapsed.
This diff is collapsed.
......@@ -102,8 +102,8 @@ namespace cuda {
template<typename T> void addFieldIndexingParam( const T & indexing );
void configure( dim3 gridDim, dim3 blockDim );
void operator() () const;
void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 );
void operator() ( cudaStream_t stream = 0 ) const;
protected:
......@@ -118,6 +118,7 @@ namespace cuda {
bool configured_;
dim3 gridDim_;
dim3 blockDim_;
std::size_t sharedMemSize_;
struct ParamInfo {
std::vector<char> data;
......@@ -178,7 +179,8 @@ namespace cuda {
template<typename FP>
Kernel<FP>::Kernel( FP funcPtr )
: funcPtr_ ( funcPtr ),
configured_( false )
configured_( false ),
sharedMemSize_( 0 )
{}
template<typename FP>
......@@ -206,12 +208,13 @@ namespace cuda {
}
template<typename FP>
void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim )
void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize )
{
if ( ! configured_ )
{
gridDim_ = gridDim;
blockDim_ = blockDim;
sharedMemSize_ = sharedMemSize;
configured_ = true;
}
else
......@@ -225,7 +228,7 @@ namespace cuda {
}
template<typename FP>
void Kernel<FP>::operator() () const
void Kernel<FP>::operator() ( cudaStream_t stream ) const
{
// check for correct number of parameter calls
......@@ -235,7 +238,7 @@ namespace cuda {
}
// set the number of blocks and threads,
WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams
WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_, sharedMemSize_, stream ) );
// register all parameters
for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt ) {
......
This diff is collapsed.
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file PinnedMemoryBuffer.h
//! \ingroup cuda
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies.
//
//======================================================================================================================
#pragma once
#include "cuda/ErrorChecking.h"
#include <algorithm>
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
namespace communication {
// Resizable buffer of page-locked ("pinned") host memory, allocated via
// cudaMallocHost. Pinned host memory is used here as staging storage for
// asynchronous CUDA memory copies (see file brief above).
template< typename T = unsigned char >
class GenericPinnedMemoryBuffer
{
public:
// element type stored in the buffer (defaults to raw bytes)
typedef T ElementType;
// creates an empty buffer; no allocation is performed
GenericPinnedMemoryBuffer();
// allocates pinned storage for initSize elements (no allocation if initSize == 0)
GenericPinnedMemoryBuffer( std::size_t initSize );
// deep copy: allocates its own pinned storage and copies pb's contents
GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb );
// frees the pinned allocation, if any
~GenericPinnedMemoryBuffer();
// raw pointer to the pinned storage (nullptr while the buffer is empty)
inline T* ptr() const { return data_; }
// grows the buffer to hold at least newSize elements; existing contents
// are NOT preserved across a reallocation (see implementation) — returns
// the (possibly new) storage pointer
inline T* resize( std::size_t newSize );
// current capacity in elements
inline std::size_t size() const { return size_; }
// assignment is deleted: buffers are only copy-constructed or resized
GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete;
private:
T * data_; // pinned host allocation (nullptr when never allocated)
std::size_t size_; // capacity in elements
};
// default instantiation: a plain byte buffer
typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer;
template< typename T > // Element type
// Default constructor: an empty buffer. No pinned memory is allocated until
// resize() is called.
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer()
: data_(nullptr), size_(0)
{
}
template< typename T > // Element type
// Constructs a buffer with capacity for initSize elements of pinned host
// memory. A request of zero elements leaves the buffer unallocated.
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize )
   : data_( nullptr ), size_( initSize )
{
   // only invoke cudaMallocHost for a positive element count
   if ( initSize != std::size_t( 0 ) )
   {
      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, initSize * sizeof(T) ) );
   }
}
template< typename T > // Element type
// Copy constructor: performs a deep copy — allocates a fresh pinned block of
// pb.size_ elements and copies pb's contents into it.
GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb )
   : data_( nullptr ), size_( pb.size_ )
{
   // data_ must be initialized to nullptr even when pb is empty: the original
   // left it indeterminate for pb.size_ == 0, so the destructor's
   // (data_ != nullptr) test could pass a garbage pointer to cudaFreeHost.
   if ( pb.size_ > 0 )
   {
      WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) );
      std::copy( pb.data_, pb.data_ + pb.size_, data_ );
   }
}
template< typename T > // Element type
// Destructor: releases the pinned host allocation, if one was ever made.
GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer()
{
if ( data_ != nullptr )
{
WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) );
}
}
template< typename T > // Element type
// Grow-only resize: if newSize exceeds the current capacity, a fresh pinned
// block is allocated and the previous one released. NOTE: the previous
// contents are NOT copied into the new allocation — this is a staging
// buffer, not a container. Returns the (possibly new) storage pointer.
T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize)
{
   if ( newSize > size_ )
   {
      // allocate first, then retire the old block (same order as a
      // malloc/swap/free sequence)
      T * freshData( nullptr );
      WALBERLA_CUDA_CHECK( cudaMallocHost( &freshData, newSize * sizeof(T) ) );
      T * retiredData = data_;
      data_ = freshData;
      if ( retiredData != nullptr )
      {
         WALBERLA_CUDA_CHECK( cudaFreeHost( retiredData ) );
      }
      size_ = newSize;
   }
   return data_;
}
} // namespace communication
} // namespace cuda
} // namespace walberla
......@@ -7,6 +7,9 @@
# basic GPUPackInfo pack/unpack test (buffered and local communication)
waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest )
waLBerla_execute_test( NAME GPUPackInfoTest )
# compares default-stream (synchronous) vs. multi-stream (asynchronous) GPUPackInfo communication
waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil )
waLBerla_execute_test( NAME GPUPackInfoCommunicationTest )
# host <-> device field copy test
# NOTE(review): FILES entry has no .cpp extension, unlike the tests above — verify this is intended
waLBerla_compile_test( FILES FieldTransferTest )
waLBerla_execute_test( NAME FieldTransferTest )
......
//========================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GPUFieldPackInfoTest.cpp
//! \ingroup cuda
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams.
//
//========================================================================================================================
#include "core/DataTypes.h"
#include "core/debug/TestSubsystem.h"
#include "core/math/Random.h"
#include "core/mpi/Environment.h"
#include "stencil/Directions.h"
#include "stencil/Iterator.h"
#include "stencil/D3Q27.h"
#include "domain_decomposition/BlockDataID.h"
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "field/GhostLayerField.h"
#include "cuda/ErrorChecking.h"
#include "cuda/HostFieldAllocator.h"
#include "cuda/GPUField.h"
#include "cuda/FieldCopy.h"
#include "cuda/communication/GPUPackInfo.h"
#include <cuda_runtime.h>
#include <vector>
using namespace walberla;
using DataType = walberla::uint_t;
using StencilType = stencil::D3Q27;
using FieldType = field::GhostLayerField< DataType, StencilType::Size >;
using GPUFieldType = cuda::GPUField< DataType >;
using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>;
using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >;
static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf };
static uint_t fieldLayoutIndex = 0;
// Block-data callback: creates a host-side ghost-layer field for the given
// block, backed by pinned host memory (cuda::HostFieldAllocator) so that the
// field can participate in asynchronous GPU transfers. The layout is taken
// from the global fieldLayouts[fieldLayoutIndex] of the current test pass.
FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage )
{
   const uint_t xCells = storage->getNumberOfXCells( *block );
   const uint_t yCells = storage->getNumberOfYCells( *block );
   const uint_t zCells = storage->getNumberOfZCells( *block );
   // one ghost layer, zero-initialized
   return new FieldType( xCells, yCells, zCells,
                         1,
                         DataType(0),
                         fieldLayouts[fieldLayoutIndex],
                         make_shared<cuda::HostFieldAllocator< DataType > >() );
}
// Block-data callback: creates the device-side (GPU) counterpart field with
// StencilType::Size entries per cell and one ghost layer, using the layout of
// the current test pass.
GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
{
   const uint_t xCells = storage->getNumberOfXCells( *block );
   const uint_t yCells = storage->getNumberOfYCells( *block );
   const uint_t zCells = storage->getNumberOfZCells( *block );
   return new GPUFieldType( xCells, yCells, zCells,
                            StencilType::Size, // one value per stencil direction
                            1,                 // one ghost layer
                            fieldLayouts[fieldLayoutIndex] );
}
void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID )
{
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto fieldPtr = block->getData< FieldType >( fieldID );
for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt )
*fieldIt = math::intRandom< DataType >();
}
}
// Test driver: verifies that GPUPackInfo communication over explicit CUDA
// streams produces exactly the same field contents as the default-stream
// path, for every layout in fieldLayouts.
int main( int argc, char ** argv )
{
debug::enterTestMode();
mpi::Environment mpiEnv( argc, argv );
// one CUDA stream per stencil direction; the set is reused for all layouts
std::vector< cudaStream_t > streams;
for( uint_t i = 0; i < StencilType::Size; ++i )
{
cudaStream_t stream(nullptr);
WALBERLA_CUDA_CHECK( cudaStreamCreate(&stream) );
streams.push_back( stream );
}
const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 );
uint_t nProc = uint_c( MPIManager::instance()->numProcesses() );
// run the whole comparison once per field layout; fieldLayoutIndex is a
// global read by the createField/createGPUField callbacks registered below
for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
{
auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks
cells[0], cells[1], cells[2], // cells
1, // unit cell spacing
true, // one block per process
true, true, true); // periodic in all directions
BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField,
"ScalarField" );
BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
"syncGPUField" );
BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField,
"asyncGPUField" );
// seed per rank so each process produces a deterministic random pattern
math::seedRandomGenerator( numeric_cast<boost::mt19937::result_type>( MPIManager::instance()->rank() ) );
// Initialize CPU field with random values
initFields( blocks, sourceFieldId );
// Copy same CPU field to both GPU fields, so both schemes start identical
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId );
auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr );
auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr );
}
// Setup communication schemes for synchronous GPUPackInfo
CommSchemeType syncCommScheme(blocks);
syncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( syncGPUFieldId ) );
// Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams
CommSchemeType asyncCommScheme(blocks);
asyncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( asyncGPUFieldId, streams ) );
// Perform one communication step for each scheme
syncCommScheme();
asyncCommScheme();
// Check results: copy both GPU fields back to pinned host fields and
// compare cell-by-cell, including the ghost layers that were communicated
FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
make_shared< cuda::HostFieldAllocator< DataType > >() );
FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex],
make_shared< cuda::HostFieldAllocator< DataType > >() );
for( auto block = blocks->begin(); block != blocks->end(); ++block )
{
auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId );
cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr );
auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId );
cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr );
// both fields have identical size/layout, so lock-step iteration is valid
for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ();
syncIt != syncFieldCpu.end();
++syncIt, ++asyncIt )
WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt );
}
}
// release all streams created above
for( uint_t i = 0; i < StencilType::Size; ++i )
WALBERLA_CUDA_CHECK( cudaStreamDestroy(streams[i]) );
return EXIT_SUCCESS;
}
......@@ -34,12 +34,15 @@
#include "stencil/D3Q27.h"
#include <cstring>
#include <vector>
#include <cuda_runtime.h>
#define F_SIZE 19
using namespace walberla;
static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf };
static uint_t fieldLayoutIndex = 0;
cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage )
{
......@@ -49,7 +52,7 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag
storage->getNumberOfZCells( *block ), // number of cells in z direction
F_SIZE, // fSize
1, // number of ghost layers
field::fzyx );
fieldLayouts[fieldLayoutIndex] );
}
// Tester base class. The communicate() template method allows testing different communication methods.
......@@ -59,7 +62,9 @@ public:
typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType;
GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {}
GPUPackInfoTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) :
block_( block ), fieldId_( fieldId ), streams_(streams) {}
virtual ~GPUPackInfoTester() {}
void test( stencil::Direction dir )
......@@ -72,7 +77,7 @@ public:
gpuField.zSize(), // number of cells in z direction
1, // number of ghost layers
0, // initial value
field::fzyx);
fieldLayouts[fieldLayoutIndex]);
cpuField.setWithGhostLayer( 0 );
int val = 0;
......@@ -82,7 +87,7 @@ public:
}
cuda::fieldCpy( gpuField, cpuField );
GPUPackInfoType gpuPackInfo( fieldId_ );
GPUPackInfoType gpuPackInfo( fieldId_, streams_ );
communicate( gpuPackInfo, dir );
cuda::fieldCpy( cpuField, gpuField );
......@@ -101,6 +106,7 @@ protected:
IBlock* block_;
BlockDataID fieldId_;
std::vector< cudaStream_t > streams_;
};
......@@ -108,7 +114,7 @@ protected:
class GPUPackInfoBufferTester: public GPUPackInfoTester
{
public:
GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams): GPUPackInfoTester( block, fieldId, streams ) {}
protected:
void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
......@@ -134,7 +140,7 @@ protected:
class GPUPackInfoLocalTester: public GPUPackInfoTester
{
public:
GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {}
GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ): GPUPackInfoTester( block, fieldId, streams ) {}
protected:
void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir )
......@@ -151,27 +157,42 @@ int main(int argc, char **argv)
debug::enterTestMode();
MPIManager::instance()->initializeMPI(&argc,&argv);
// Create BlockForest
uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
auto blocks = createUniformBlockGrid(processes,1,1, //blocks
2,2,2, //cells
1, //dx
false, //one block per process
true,true,true);//periodicity
BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
&createGPUField, "ScalarGPUField" );
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex )
{
GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId );
GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId );
std::vector< cudaStream_t > streams;
for( uint_t s = 0; s < stencil::D3Q27::Size; ++s )
{
cudaStream_t stream(nullptr);
WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) );
streams.push_back( stream );
}
// Create BlockForest
uint_t processes = uint_c( MPIManager::instance()->numProcesses() );
auto blocks = createUniformBlockGrid(processes,1,1, //blocks
2,2,2, //cells
1, //dx
false, //one block per process
true,true,true);//periodicity
BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >(
&createGPUField, "ScalarGPUField" );
for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt )
{
GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId, streams );
GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId, streams );
for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
{
localTester.test( *dir );
bufferTester.test( *dir );
}
}
for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir )
for( auto streamIt = streams.begin(); streamIt != streams.end(); ++streamIt )
{
localTester.test( *dir );
bufferTester.test( *dir );
cudaStream_t & stream = *streamIt;
WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) );
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment