Commit dd28a536 authored by Paulo Carvalho, committed by Martin Bauer

CUDA communication that does not rely on CUDA-aware MPI

parent 6fc7b559
......@@ -19,6 +19,7 @@ Lorenz Hufnagel
Martin Bauer
Matthias Markl
Michael Kuron
Paulo Carvalho
Regina Ammer
Sagar Dolas
Sebastian Eibl
......
......@@ -22,6 +22,7 @@
#include "cuda/HostFieldAllocator.h"
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformDirectScheme.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "core/Environment.h"
......@@ -30,6 +31,8 @@
#include "cuda/GPUField.h"
#include "cuda/Kernel.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "cuda/communication/GPUPackInfo.h"
#include "field/AddToStorage.h"
#include "field/communication/UniformMPIDatatypeInfo.h"
......@@ -113,16 +116,22 @@ int main( int argc, char ** argv )
BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" );
BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" );
typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme;
CommScheme communication( blocks );
communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) );
typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9 > CommScheme;
typedef cuda::communication::GPUPackInfo<GPUField> Packing;
// Alternative, if CUDA-aware MPI is available:
//blockforest::communication::UniformDirectScheme<stencil::D2Q9 >
//typedef field::communication::UniformMPIDatatypeInfo<GPUField> Packing
CommScheme commScheme(blocks);
commScheme.addDataToCommunicate( make_shared<Packing>(gpuFieldSrcID) );
// Create Timeloop
const uint_t numberOfTimesteps = uint_t(10); // number of timesteps for non-gui runs
SweepTimeloop timeloop ( blocks, numberOfTimesteps );
// Registering the sweep
timeloop.add() << BeforeFunction( communication, "Communication" )
timeloop.add() << BeforeFunction( commScheme, "Communication" )
<< Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" );
timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) );
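With the buffered scheme, ghost-layer data is staged through host-side MPI buffers, so plain MPI suffices; the direct scheme remains usable where CUDA-aware MPI is present. A minimal sketch of selecting between the two paths at compile time, built from the typedefs above (the HAVE_CUDA_AWARE_MPI macro is a hypothetical user-defined switch, not a waLBerla flag):

#ifdef HAVE_CUDA_AWARE_MPI
   typedef blockforest::communication::UniformDirectScheme< stencil::D2Q9 > CommScheme;
   typedef field::communication::UniformMPIDatatypeInfo< GPUField >         Packing;
#else
   typedef blockforest::communication::UniformBufferedScheme< stencil::D2Q9 > CommScheme;
   typedef cuda::communication::GPUPackInfo< GPUField >                       Packing;
#endif

CommScheme commScheme( blocks );
commScheme.addDataToCommunicate( make_shared< Packing >( gpuFieldSrcID ) );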
......
......@@ -140,7 +140,7 @@ public:
inline void reserve( size_t newCapacity );
inline void resize ( size_t newSize );
template< typename V > void peek ( V& value ) const;
inline void skip ( size_t elements );
inline T * skip ( size_t elements );
inline void clear ();
inline void reset ();
inline void readDebugMarker( const char * marker );
......@@ -578,10 +578,15 @@ inline void GenericRecvBuffer<T>::peek( V& value ) const
// This function skips \a elements receive buffer elements of type \a T and returns the previous buffer position.
*/
template< typename T > // Element type
void GenericRecvBuffer<T>::skip( size_t elements )
T * GenericRecvBuffer<T>::skip( size_t elements )
{
auto previous = cur_;
cur_ += elements;
if( cur_ > end_ ) cur_ = end_;
// Invariants check
WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ );
return previous;
}
//**********************************************************************************************************************
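Returning the pre-skip cursor lets a consumer read a block of elements in place instead of extracting them one by one. A hedged sketch of an unpack step that copies straight from the MPI receive buffer to the device (recvBuffer, devicePtr and nBytes are illustrative; mpi::RecvBuffer is assumed to have unsigned char elements):

// Sketch: consume 'nBytes' buffer elements and hand them to the CUDA runtime
// without an intermediate host-side copy.
unsigned char * bufPtr = recvBuffer.skip( nBytes );  // returns old position, advances cursor
cudaMemcpy( devicePtr, bufPtr, nBytes, cudaMemcpyHostToDevice );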
......
......@@ -154,6 +154,7 @@ public:
//**Repositioning ***************************************************************************************************
/*!\name Repositioning */
inline T * forward( uint_t elements );
inline void rewind(const size_t & size);
//@}
//*******************************************************************************************************************
......@@ -508,6 +509,35 @@ GenericSendBuffer<T,G>::operator<<( V value )
//
//======================================================================================================================
//**********************************************************************************************************************
/*!\brief Forwards the buffer cursor by the given number of elements.
//
// \param elements The number of elements to advance.
// \return Previous position.
//
// This function forwards \a elements send buffer elements of type \a T and returns the previous buffer position.
*/
template< typename T // Element type
, typename G > // Growth policy
T * GenericSendBuffer<T,G>::forward( uint_t elements )
{
const size_t rest = numeric_cast< size_t >( end_ - cur_ );
// Checking the size of the remaining memory
if( rest < elements ) {
extendMemory( size() + elements );
}
// Adding the data value
auto previous = cur_;
cur_ += elements;
// Invariants check
WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ );
return previous;
}
//**********************************************************************************************************************
/*!\brief Rewinds the stream to a previous position
//
......
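forward() is the send-side counterpart: it grows the buffer if necessary and returns a pointer to the reserved region, which can then be filled directly, e.g. by a device-to-host copy; rewind() lets the caller shrink the buffer back if less data was written than reserved. A hedged sketch (sendBuffer, devicePtr and nBytes are illustrative):

// Sketch: reserve 'nBytes' elements in a mpi::SendBuffer and fill the
// reserved region from GPU memory.
unsigned char * bufPtr = sendBuffer.forward( nBytes );  // may reallocate, returns old position
cudaMemcpy( bufPtr, devicePtr, nBytes, cudaMemcpyDeviceToHost );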
......@@ -45,7 +45,8 @@ namespace cuda {
const std::string & identifier,
uint_t fSize,
const Layout layout = fzyx,
uint_t nrOfGhostLayers = 1 );
uint_t nrOfGhostLayers = 1,
bool usePitchedMem = true );
......@@ -61,7 +62,8 @@ namespace cuda {
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier );
const std::string & identifier,
bool usePitchedMem = true );
......
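A hedged usage sketch of the new parameter: pitched allocation (the default) pads each row for aligned device access, while passing false is assumed to yield an unpadded, contiguous allocation:

BlockDataID gpuPitched = cuda::addGPUFieldToStorage< ScalarField >( blocks, cpuFieldID, "GPU Field pitched" );        // usePitchedMem = true by default
BlockDataID gpuLinear  = cuda::addGPUFieldToStorage< ScalarField >( blocks, cpuFieldID, "GPU Field linear", false );  // contiguous rows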
......@@ -33,26 +33,28 @@ namespace cuda {
const StructuredBlockStorage * const bs,
uint_t ghostLayers,
uint_t fSize,
const field::Layout & layout )
const field::Layout & layout,
bool usePitchedMem )
{
return new GPUField_T( bs->getNumberOfXCells( *block ),
bs->getNumberOfYCells( *block ),
bs->getNumberOfZCells( *block ),
fSize, ghostLayers, layout );
fSize, ghostLayers, layout, usePitchedMem );
}
template< typename Field_T>
GPUField< typename Field_T::value_type> *
createGPUFieldFromCPUField( const IBlock * const block,
const StructuredBlockStorage * const,
ConstBlockDataID cpuFieldID
ConstBlockDataID cpuFieldID,
bool usePitchedMem
)
{
typedef GPUField< typename Field_T::value_type> GPUField_T;
const Field_T * f = block->getData<Field_T>( cpuFieldID );
auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(),
f->nrOfGhostLayers(), f->layout() );
f->nrOfGhostLayers(), f->layout(), usePitchedMem );
cuda::fieldCpy( *gpuField, *f );
......@@ -67,9 +69,10 @@ namespace cuda {
const std::string & identifier,
uint_t fSize,
const Layout layout,
uint_t nrOfGhostLayers )
uint_t nrOfGhostLayers,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout );
auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout, usePitchedMem );
return bs->addStructuredBlockData< GPUField_T >( func, identifier );
}
......@@ -77,9 +80,10 @@ namespace cuda {
template< typename Field_T>
BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs,
ConstBlockDataID cpuFieldID,
const std::string & identifier )
const std::string & identifier,
bool usePitchedMem )
{
auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID );
auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID, usePitchedMem );
return bs->addStructuredBlockData< GPUField<typename Field_T::value_type> >( func, identifier );
}
......
......@@ -4,6 +4,6 @@
#
###################################################################################################
waLBerla_add_module( DEPENDS core domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA )
waLBerla_add_module( DEPENDS core communication domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA )
###################################################################################################
\ No newline at end of file
......@@ -39,10 +39,10 @@ namespace cuda {
};
FieldAccessor( char * ptr,
uint32_t xOffset,
uint32_t yOffset,
uint32_t zOffset,
uint32_t fOffset,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
IndexingScheme indexingScheme )
: ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset),
fOffset_(fOffset), indexingScheme_(indexingScheme )
......@@ -65,7 +65,7 @@ namespace cuda {
}
__device__ unsigned int getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
__device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim )
{
return threadIdx.x +
blockIdx.x * blockDim.x +
......@@ -73,6 +73,8 @@ namespace cuda {
blockIdx.z * blockDim.x * gridDim.x * gridDim.y ;
}
// This is always true for this specific field indexing class.
__device__ __forceinline__ bool isValidPosition() { return true; }
__device__ T & get() { return * (T*)(ptr_); }
__device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); }
......@@ -80,26 +82,26 @@ namespace cuda {
__device__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) );
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * (int)(xOffset_) +
cy * (int)(yOffset_) +
cz * (int)(zOffset_) +
cf * (int)(fOffset_) );
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint32_t xOffset_;
uint32_t yOffset_;
uint32_t zOffset_;
uint32_t fOffset_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
IndexingScheme indexingScheme_;
};
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldAccessor3D.h
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#pragma once
#include "core/DataTypes.h"
#include <cuda_runtime.h>
namespace walberla {
namespace cuda {
template<typename T>
class FieldAccessor3D
{
public:
FieldAccessor3D( char * ptr,
uint_t xOffset,
uint_t yOffset,
uint_t zOffset,
uint_t fOffset,
const dim3 & idxDim,
const dim3 & blockDim )
: ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ),
idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false )
{}
__device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx )
{
uint_t x = blockIdx.x * blockDim_.x + threadIdx.x;
uint_t y = blockIdx.y * blockDim_.y + threadIdx.y;
uint_t z = blockIdx.z * blockDim_.z + threadIdx.z;
if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z )
{
ptr_ += x * xOffset_ + y * yOffset_ + z * zOffset_;
isValidPosition_ = true;
}
}
__device__ __forceinline__ bool isValidPosition() { return isValidPosition_; }
__device__ __forceinline__ T & get() { return * (T*)(ptr_); }
__device__ __forceinline__ T & get( int f ) { return * (T*)(ptr_ + f * fOffset_); }
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz ) const
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ );
}
__device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz, int cf )
{
return * (T*)( ptr_ + cx * xOffset_ +
cy * yOffset_ +
cz * zOffset_ +
cf * fOffset_ );
}
protected:
char * ptr_;
uint_t xOffset_;
uint_t yOffset_;
uint_t zOffset_;
uint_t fOffset_;
dim3 idxDim_;
dim3 blockDim_;
bool isValidPosition_;
};
} // namespace cuda
} // namespace walberla
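A minimal kernel sketch using the new accessor (kernel, field and parameter names are illustrative): set() advances the data pointer only for threads inside the index space, so every thread must check isValidPosition() before dereferencing; this is the same contract the FieldAccessor above satisfies trivially by always returning true:

template< typename T >
__global__ void scaleKernel( walberla::cuda::FieldAccessor3D<T> acc, T factor )
{
   acc.set( blockIdx, threadIdx );        // position the accessor at this thread's cell
   if ( !acc.isValidPosition() ) return;  // thread lies outside the index interval
   acc.get() *= factor;                   // safe element access
}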
......@@ -109,12 +109,14 @@ namespace cuda {
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() )
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() );
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Fields have to have the same size");
......@@ -127,20 +129,20 @@ namespace cuda {
src.xAllocSize(), // inner dimension size
src.yAllocSize() ); // next outer dimension size
p.extent.width = src.xAllocSize() * sizeof(T);
p.extent.height = src.yAllocSize();
p.extent.depth = src.zAllocSize() * src.fAllocSize();
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
else
{
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer
sizeof(T) * src.fAllocSize(), // pitch
src.fAllocSize(), // inner dimension size
src.xAllocSize() ); // next outer dimension size
p.extent.width = src.fAllocSize() * sizeof(T);
p.extent.height = src.xAllocSize();
p.extent.depth = src.yAllocSize() * src.zAllocSize();
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
p.dstPtr = dst.pitchedPtr();
......@@ -163,12 +165,14 @@ namespace cuda {
bool canCopy = ( src.layout() == fzyx &&
dst.fAllocSize() == src.fAllocSize() &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() )
dst.yAllocSize() == src.yAllocSize() &&
dst.xSize() == src.xSize() )
||
( src.layout() == zyxf &&
dst.zAllocSize() == src.zAllocSize() &&
dst.yAllocSize() == src.yAllocSize() &&
dst.xAllocSize() == src.xAllocSize() );
dst.xAllocSize() == src.xAllocSize() &&
dst.fSize() == src.fSize() );
if ( !canCopy ) {
WALBERLA_ABORT("Fields have to have the same size");
......@@ -181,7 +185,7 @@ namespace cuda {
dst.xAllocSize(), // inner dimension size
dst.yAllocSize() ); // next outer dimension size
p.extent.width = dst.xAllocSize() * sizeof(T);
p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T);
p.extent.height = dst.yAllocSize();
p.extent.depth = dst.zAllocSize() * dst.fAllocSize();
}
......@@ -192,7 +196,7 @@ namespace cuda {
dst.fAllocSize(), // inner dimension size
dst.xAllocSize() ); // next outer dimension size
p.extent.width = dst.fAllocSize() * sizeof(T);
p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T);
p.extent.height = dst.xAllocSize();
p.extent.depth = dst.yAllocSize() * dst.zAllocSize();
}
......
......@@ -38,7 +38,7 @@ namespace cuda {
template< typename T>
FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
uint3 _blockDim, uint3 _gridDim,
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess )
: field_( field ),
blockDim_( _blockDim ),
......@@ -56,8 +56,8 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field,
threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock );
}
WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock,
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
"InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " <<
"than the maximal thread count per CUDA block." );
}
}
......@@ -93,29 +93,29 @@ void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::Ind
template< typename T>
FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd )
{
unsigned int xOffset, yOffset, zOffset, fOffset;
uint_t xOffset, yOffset, zOffset, fOffset;
if ( f.layout() == field::zyxf )
{
fOffset = sizeof(T);
xOffset = uint32_c( f.pitchedPtr().pitch );
yOffset = xOffset * uint32_c( f.xAllocSize() );
zOffset = yOffset * uint32_c( f.yAllocSize() );
xOffset = f.pitchedPtr().pitch;
yOffset = xOffset * f.xAllocSize();
zOffset = yOffset * f.yAllocSize();
}
else
{
xOffset = sizeof(T);
yOffset = uint32_c( f.pitchedPtr().pitch );
zOffset = yOffset * uint32_c( f.yAllocSize() );
fOffset = zOffset * uint32_c( f.zAllocSize() );
yOffset = f.pitchedPtr().pitch;
zOffset = yOffset * f.yAllocSize();
fOffset = zOffset * f.zAllocSize();
}
char * data = (char*)f.pitchedPtr().ptr;
// Jump over ghost cells to first inner cell
cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() );
data += ( ci.xMin() + gl )* int_c(xOffset) +
( ci.yMin() + gl )* int_c(yOffset) +
( ci.zMin() + gl )* int_c(zOffset);
data += ( ci.xMin() + gl )* xOffset +
( ci.yMin() + gl )* yOffset +
( ci.zMin() + gl )* zOffset;
dim3 gridDim;
......@@ -183,6 +183,15 @@ FieldIndexing<T> FieldIndexing<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> &
return interval( f, ci, 0, 1 );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice )
{
CellInterval ci;
f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice );
return interval( f, ci );
}
template< typename T>
FieldIndexing<T> FieldIndexing<T>::allInner ( const GPUField<T> & f )
{
......
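sliceXYZ complements the existing ghost-layer helpers by exposing the signed distance directly, which is what a pack/unpack pair needs: the same thickness is indexed once just inside the boundary (packing) and once inside the ghost region (unpacking). A hedged usage sketch; the exact distance semantics follow GPUField<T>::getSlice():

// Sketch: index one layer just inside the +x boundary and the adjacent
// ghost layer of a double-valued GPU field.
auto inner = cuda::FieldIndexing<double>::sliceBeforeGhostLayerXYZ( gpuField, 1, stencil::E );
auto ghost = cuda::FieldIndexing<double>::sliceXYZ( gpuField, cell_idx_t(-1), 1, stencil::E );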
......@@ -43,8 +43,8 @@ namespace cuda {
//** Kernel call ******************************************************************************************
/*! \name Kernel call */
//@{
uint3 blockDim() const { return blockDim_; }
uint3 gridDim () const { return gridDim_; }
dim3 blockDim() const { return blockDim_; }
dim3 gridDim () const { return gridDim_; }
const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; }
//@}
......@@ -67,6 +67,8 @@ namespace cuda {
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness,
stencil::Direction dir, bool fullSlice = false );
static FieldIndexing<T> allInner ( const GPUField<T> & f );
static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f );
......@@ -76,12 +78,12 @@ namespace cuda {
protected:
FieldIndexing ( const GPUField<T> & field,
uint3 _blockDim, uint3 _gridDim,
const FieldAccessor<T> _gpuAccess );
dim3 _blockDim, dim3 _gridDim,
const FieldAccessor<T> _gpuAccess );
const GPUField<T> & field_;
uint3 blockDim_;
uint3 gridDim_;
dim3 blockDim_;
dim3 gridDim_;
FieldAccessor<T> gpuAccess_;
};
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file FieldIndexing3D.cpp
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//
//======================================================================================================================
#include "FieldIndexing3D.h"
#include "GPUTypesExplicitInstantiation.h"
#include "GPUField.h"
#include "core/cell/CellInterval.h"