Commit 319909f0 authored by Martin Bauer's avatar Martin Bauer
Browse files

New GPU communication scheme with GPU kernels for packing

Features:
   - uses generated pack infos for packing & unpacking directly on GPU
   - can directly send GPU buffers if cuda-enabled MPI is available,
     otherwise the packed buffers are transferred to CPU first
   - communication hiding with cuda streams: communication can be run
     asynchronously - especially useful when compute kernel is also
     split up into inner and outer part

- added RAII classes for CUDA streams and events
- equivalence test that checks if generated CPU and GPU (overlapped)
  versions are computing same result as normal waLBerla LBM kernel
parent b3213d8a
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CudaRAII.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "ErrorChecking.h"
namespace walberla {
namespace cuda {
class StreamRAII
{
public:
~StreamRAII() {
if( stream_ != 0 ) {
WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ ));
}
}
StreamRAII( StreamRAII && other) {
stream_ = other.stream_;
other.stream_ = 0;
}
StreamRAII(const StreamRAII&) = delete;
void operator=( const StreamRAII &) = delete;
operator cudaStream_t() const { return stream_; }
static StreamRAII defaultStream() {
StreamRAII result;
result.stream_ = 0;
return result;
}
static StreamRAII newPriorityStream(int priority) {
StreamRAII result;
WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority ));
return result;
}
static StreamRAII newStream() {
StreamRAII result;
WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_));
return result;
}
private:
StreamRAII() {}
cudaStream_t stream_;
};
struct EventRAII
{
explicit EventRAII() { WALBERLA_CUDA_CHECK( cudaEventCreate(&event) ); }
~EventRAII() { WALBERLA_CUDA_CHECK( cudaEventDestroy(event) ); }
EventRAII(const EventRAII &) = delete;
void operator=( const EventRAII &) = delete;
operator cudaEvent_t() const { return event; }
cudaEvent_t event;
};
} // namespace cuda
} // namespace walberla
\ No newline at end of file
...@@ -40,7 +40,7 @@ inline void checkForError( cudaError_t code, const std::string & callerPath, con ...@@ -40,7 +40,7 @@ inline void checkForError( cudaError_t code, const std::string & callerPath, con
if(code != cudaSuccess) if(code != cudaSuccess)
{ {
std::stringstream ss; std::stringstream ss;
ss << "CUDA Error: " << cudaGetErrorString( code ); ss << "CUDA Error: " << code << " " << cudaGetErrorName(code) << ": " << cudaGetErrorString( code );
Abort::instance()->abort( ss.str(), callerPath, line ); Abort::instance()->abort( ss.str(), callerPath, line );
} }
} }
......
...@@ -118,6 +118,9 @@ namespace cuda { ...@@ -118,6 +118,9 @@ namespace cuda {
inline uint_t nrOfGhostLayers() const { return nrOfGhostLayers_; } inline uint_t nrOfGhostLayers() const { return nrOfGhostLayers_; }
inline CellInterval xyzSize() const;
inline CellInterval xyzSizeWithGhostLayer() const;
bool operator==( const GPUField & other ) const; bool operator==( const GPUField & other ) const;
void getGhostRegion( stencil::Direction d, CellInterval & ci, void getGhostRegion( stencil::Direction d, CellInterval & ci,
......
...@@ -112,6 +112,26 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, ...@@ -112,6 +112,26 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci,
} }
template<typename T>
inline CellInterval GPUField<T>::xyzSize() const
{
   // Zero-based interval over the interior cells (ghost layers excluded).
   const cell_idx_t xMax = cell_idx_c( xSize() ) - 1;
   const cell_idx_t yMax = cell_idx_c( ySize() ) - 1;
   const cell_idx_t zMax = cell_idx_c( zSize() ) - 1;
   return CellInterval( 0, 0, 0, xMax, yMax, zMax );
}
template<typename T>
inline CellInterval GPUField<T>::xyzSizeWithGhostLayer() const
{
   // Expand the interior interval by the ghost layer count in each of the
   // three spatial directions.
   const cell_idx_t gl = cell_idx_c( nrOfGhostLayers_ );
   CellInterval ci = GPUField<T>::xyzSize();
   for( uint_t d = 0; d < 3; ++d )
   {
      ci.min()[d] -= gl;
      ci.max()[d] += gl;
   }
   return ci;
}
template<typename T> template<typename T>
void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci, void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci,
cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CustomMemoryBuffer.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//! \brief Basic Buffer supporting different memory spaces
//
//======================================================================================================================
#pragma once
#include "cuda/ErrorChecking.h"
#include <algorithm>
#include <cstring>
namespace walberla {
namespace cuda {
namespace communication {
struct HostMemoryAllocator;
struct DeviceMemoryAllocator;
//*******************************************************************************************************************
/*!
* Simple buffer class that supports memory allocators, e.g. for pinned host memory or GPU memory
*
* \ingroup cuda
*
* In contrast to core::mpi::Buffer this class does not support stream operators "<<" and ">>" because these
* operators imply serial (un)packing which is not feasible on the GPU.
* The allocator template has to provide:
* - static void *allocate( size_t size )
* - void deallocate( void *ptr )
* - void memcpy( void *dst, void *src, size_t count )
*
* The buffer has a beginning, a current position and an end position. Here is an overview of the most important
* operations:
* - clear: reset current position to begin, does not change size
 *    - advance: moves current position number of bytes forward and returns pointer to the old current position
* two versions are available, one that automatically resizes and reallocates the buffer, and one that
* fails if not enough space is available
*/
//*******************************************************************************************************************
template<typename Allocator>
class CustomMemoryBuffer
{
public:
   // buffers are raw byte buffers; typed access goes through the templated advance overloads
   typedef uint8_t ElementType;

   explicit CustomMemoryBuffer();
   explicit CustomMemoryBuffer( std::size_t initSize );
   explicit CustomMemoryBuffer( const CustomMemoryBuffer &pb );
   ~CustomMemoryBuffer();
   CustomMemoryBuffer &operator=( const CustomMemoryBuffer &pb );

   // grows the allocation to at least newSize bytes; never shrinks
   void resize( std::size_t newSize );
   // total number of allocated bytes
   inline std::size_t allocSize() const { return std::size_t(end_ - begin_); }
   // number of bytes in use, i.e. distance from begin to current position
   inline std::size_t size() const { return std::size_t(cur_ - begin_); }
   ElementType *ptr() const { return begin_; }

   // resets the current position to the beginning; allocation is kept
   inline void clear() { cur_ = begin_; }

   // moves the current position forward, reallocating if necessary,
   // and returns a pointer to the old current position
   ElementType *advance( std::size_t bytes );
   // like advance(), but returns nullptr instead of reallocating
   // when not enough space is left
   ElementType *advanceNoResize( std::size_t bytes );

   // typed variants: bytes is the number of elements of type T
   template<typename T>
   T *advance( std::size_t bytes ) { return reinterpret_cast<T *>( advance( bytes * sizeof( T ))); }
   template<typename T>
   T *advanceNoResize( std::size_t bytes ) { return reinterpret_cast<T *>( advanceNoResize( bytes * sizeof( T ))); }

private:
   ElementType *begin_;   // start of allocation (nullptr if empty)
   ElementType *cur_;     // current position, begin_ <= cur_ <= end_
   ElementType *end_;     // one past the end of the allocation
};
using PinnedMemoryBuffer = CustomMemoryBuffer<HostMemoryAllocator>;
using GPUMemoryBuffer = CustomMemoryBuffer<DeviceMemoryAllocator>;
//! Allocator backend using pinned (page-locked) host memory via cudaMallocHost,
//! suitable for staging asynchronous host/device transfers.
struct HostMemoryAllocator
{
   static void *allocate( size_t size )
   {
      void *ptr = nullptr;
      WALBERLA_CUDA_CHECK( cudaMallocHost( &ptr, size ));
      return ptr;
   }

   static void deallocate( void *ptr )
   {
      WALBERLA_CUDA_CHECK( cudaFreeHost( ptr ));
   }

   // host-to-host copy, plain memcpy is sufficient for pinned memory
   static void memcpy( void *dst, void *src, size_t count ) { std::memcpy( dst, src, count ); }
};
//! Allocator backend using GPU device memory via cudaMalloc.
struct DeviceMemoryAllocator
{
   static void *allocate( size_t size )
   {
      void *p;
      WALBERLA_CUDA_CHECK( cudaMalloc( &p, size ));
      return p;
   }

   static void deallocate( void *ptr )
   {
      WALBERLA_CUDA_CHECK( cudaFree( ptr ));
   }

   static void memcpy( void *dst, void *src, size_t count )
   {
      // fix: the return value of cudaMemcpy was silently discarded here,
      // unlike every other CUDA call in this file - check it as well
      WALBERLA_CUDA_CHECK( cudaMemcpy( dst, src, count, cudaMemcpyDeviceToDevice ));
   }
};
} // namespace communication
} // namespace cuda
} // namespace walberla
#include "CustomMemoryBuffer.impl.h"
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file CustomMemoryBuffer.impl.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
namespace walberla {
namespace cuda {
namespace communication {
// Constructs an empty buffer: no allocation, all pointers null.
template<typename Allocator>
CustomMemoryBuffer<Allocator>::CustomMemoryBuffer()
   : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
{
}
// Constructs a buffer with an initial allocation of initSize bytes.
// The current position starts at the beginning, i.e. size() == 0.
template<typename Allocator>
CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( std::size_t initSize )
   : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
{
   if( initSize > 0 )
   {
      // fix: Allocator::allocate returns void*, which does not implicitly
      // convert to ElementType* in C++ - an explicit cast is required
      // (the copy constructor already does this correctly)
      begin_ = reinterpret_cast<ElementType *>( Allocator::allocate( initSize ));
      end_ = begin_ + initSize;
      cur_ = begin_;
   }
}
// Copy constructor: duplicates the full allocation of pb (not just the used
// part) and restores the same current position.
template<typename Allocator>
CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( const CustomMemoryBuffer &pb )
   : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
   // fix: when pb was empty the members were left uninitialized, causing
   // undefined behavior in the destructor and in allocSize()/size()
{
   if( pb.begin_ != nullptr )
   {
      begin_ = reinterpret_cast<ElementType *>(Allocator::allocate( pb.allocSize()));
      end_ = begin_ + pb.allocSize();
      Allocator::memcpy( begin_, pb.begin_, pb.allocSize());
      cur_ = begin_ + pb.size();
   }
}
// Copy assignment via copy-and-swap: build a temporary copy, then exchange
// its state with ours; the temporary destroys our old allocation on exit.
// This also makes self-assignment safe.
template<typename Allocator>
CustomMemoryBuffer<Allocator> &CustomMemoryBuffer<Allocator>::operator=( const CustomMemoryBuffer<Allocator> &pb )
{
   CustomMemoryBuffer<Allocator> tmp( pb );
   std::swap( begin_, tmp.begin_ );
   std::swap( end_, tmp.end_ );
   std::swap( cur_, tmp.cur_ );
   return *this;
}
// Releases the backing allocation; empty buffers own no memory.
template<typename Allocator>
CustomMemoryBuffer<Allocator>::~CustomMemoryBuffer()
{
   if( begin_ )
      Allocator::deallocate( begin_ );
}
// Grows the allocation to at least newSize bytes, preserving the contents
// and the current position. Never shrinks; a no-op if newSize <= allocSize().
template<typename Allocator>
void CustomMemoryBuffer<Allocator>::resize( std::size_t newSize )
{
   if( newSize > allocSize())
   {
      const auto offset = cur_ - begin_;
      auto *newBegin = reinterpret_cast<ElementType *>(Allocator::allocate( newSize ));
      // fix: growing an empty buffer previously called Allocator::memcpy with
      // a null source pointer, which is undefined behavior even for count 0
      if( begin_ != nullptr )
      {
         Allocator::memcpy( newBegin, begin_, std::size_t( end_ - begin_ ));
         Allocator::deallocate( begin_ );
      }
      begin_ = newBegin;
      end_ = begin_ + newSize;
      cur_ = begin_ + offset;
   }
}
// Moves the current position forward by bytes, reallocating if necessary,
// and returns a pointer to the position before the move.
template<typename Allocator>
typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advance( std::size_t bytes )
{
   resize( size() + bytes );
   ElementType *oldPos = cur_;
   cur_ += bytes;
   WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ );
   return oldPos;
}
// Like advance(), but never reallocates: returns nullptr when the remaining
// capacity is insufficient for the requested number of bytes.
template<typename Allocator>
typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advanceNoResize( std::size_t bytes )
{
   if( size() + bytes > allocSize())
      return nullptr;
   return advance( bytes );
}
} // namespace communication
} // namespace cuda
} // namespace walberla
...@@ -21,21 +21,18 @@ ...@@ -21,21 +21,18 @@
#pragma once #pragma once
#include "blockforest/Block.h"
#include "communication/UniformPackInfo.h"
#include "core/debug/Debug.h" #include "core/debug/Debug.h"
#include "core/math/Vector3.h" #include "core/math/Vector3.h"
#include "core/mpi/BufferSizeTrait.h" #include "core/mpi/BufferSizeTrait.h"
#include "stencil/Directions.h"
#include "field/GhostRegions.h" #include "field/GhostRegions.h"
#include "field/Layout.h"
#include "communication/UniformPackInfo.h" #include "stencil/Directions.h"
#include "blockforest/Block.h"
#include "cuda/ErrorChecking.h" #include "cuda/ErrorChecking.h"
#include "cuda/GPUCopy.h" #include "cuda/GPUCopy.h"
#include "cuda/communication/PinnedMemoryBuffer.h" #include "cuda/communication/CustomMemoryBuffer.h"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <map> #include <map>
...@@ -142,7 +139,8 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d ...@@ -142,7 +139,8 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d
if ( copyAsync_ ) if ( copyAsync_ )
{ {
PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir]; PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir];
copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead ); pinnedBuffer.clear();
copyBufferPtr = pinnedBuffer.advance( nrOfBytesToRead );
// Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU
std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr ); std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr );
} }
...@@ -158,7 +156,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d ...@@ -158,7 +156,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d
auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
fieldPtr->fSize() ); fieldPtr->fSize() );
if ( fieldPtr->layout() == fzyx ) if ( fieldPtr->layout() == field::fzyx )
{ {
const uint_t dstAllocSizeZ = fieldPtr->zAllocSize(); const uint_t dstAllocSizeZ = fieldPtr->zAllocSize();
const uint_t srcAllocSizeZ = fieldCi.zSize(); const uint_t srcAllocSizeZ = fieldCi.zSize();
...@@ -217,7 +215,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r ...@@ -217,7 +215,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r
auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() ); auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() );
if ( sf->layout() == fzyx ) if ( sf->layout() == field::fzyx )
{ {
const uint_t dstAllocSizeZ = rf->zAllocSize(); const uint_t dstAllocSizeZ = rf->zAllocSize();
const uint_t srcAllocSizeZ = sf->zAllocSize(); const uint_t srcAllocSizeZ = sf->zAllocSize();
...@@ -263,7 +261,8 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc ...@@ -263,7 +261,8 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc
if ( copyAsync_ ) if ( copyAsync_ )
{ {
PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir]; PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir];
copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack ); pinnedBuffer.clear();
copyBufferPtr = pinnedBuffer.advance( nrOfBytesToPack );
} }
auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) );
...@@ -275,7 +274,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc ...@@ -275,7 +274,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc
auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(),
fieldPtr->fSize() ); fieldPtr->fSize() );
if ( fieldPtr->layout() == fzyx ) if ( fieldPtr->layout() == field::fzyx )
{ {
const uint_t dstAllocSizeZ = fieldCi.zSize(); const uint_t dstAllocSizeZ = fieldCi.zSize();
const uint_t srcAllocSizeZ = fieldPtr->zAllocSize(); const uint_t srcAllocSizeZ = fieldPtr->zAllocSize();
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GeneratedGPUPackInfo.h
//! \ingroup core
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#pragma once
#include "stencil/Directions.h"
#include "domain_decomposition/IBlock.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
//*******************************************************************************************************************
/*!
 * Interface for generated pack infos that pack/unpack ghost layer data
 * directly on the GPU, asynchronously on a given CUDA stream.
 */
//*******************************************************************************************************************
class GeneratedGPUPackInfo
{
public:
   // fix: polymorphic base class had no virtual destructor - deleting a
   // derived object through a GeneratedGPUPackInfo* was undefined behavior
   virtual ~GeneratedGPUPackInfo() = default;

   //! pack data for communication in direction dir into buffer, running on stream
   virtual void pack  ( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
   //! unpack received data for direction dir from buffer, running on stream
   virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
   //! number of bytes required for direction dir on the given block
   virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0;
};
} //namespace cuda
} //namespace walberla
\ No newline at end of file
...@@ -13,111 +13,82 @@ ...@@ -13,111 +13,82 @@
// You should have received a copy of the GNU General Public License along // You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
// //
//! \file PinnedMemoryBuffer.h //! \file UniformGPUScheme.h
//! \ingroup cuda //! \ingroup cuda
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \author Martin Bauer <martin.bauer@fau.de>
//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies.
// //
//====================================================================================================================== //======================================================================================================================
#pragma once #pragma once
#include "cuda/ErrorChecking.h" #include "blockforest/StructuredBlockForest.h"
#include "core/mpi/MPIWrapper.h"
#include "core/mpi/BufferSystem.h"
#include "core/WeakPtrWrapper.h"
#include "domain_decomposition/IBlock.h"
#include "stencil/Directions.h"
#include <algorithm> #include "cuda/CudaRAII.h"
#include <cuda_runtime.h> #include "cuda/communication/GeneratedGPUPackInfo.h"
#include "cuda/communication/CustomMemoryBuffer.h"
#include <chrono>
#include <thread>
namespace walberla { namespace walberla {
namespace cuda { namespace cuda {
namespace communication { namespace communication {