diff --git a/AUTHORS.txt b/AUTHORS.txt index 7e17599fe65ab501fd023ab0288490e9ab4938f6..65d88317f9734455c873ea661d7a545a4df987cf 100644 --- a/AUTHORS.txt +++ b/AUTHORS.txt @@ -19,6 +19,7 @@ Lorenz Hufnagel Martin Bauer Matthias Markl Michael Kuron +Paulo Carvalho Regina Ammer Sagar Dolas Sebastian Eibl diff --git a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp index 6f8da219715bd1e5b74601b2fc683b59c1600346..a261e0c07398462c4c8687ad653a552bad993b15 100644 --- a/apps/tutorials/cuda/01_GameOfLife_cuda.cpp +++ b/apps/tutorials/cuda/01_GameOfLife_cuda.cpp @@ -22,6 +22,7 @@ #include "cuda/HostFieldAllocator.h" #include "blockforest/Initialization.h" #include "blockforest/communication/UniformDirectScheme.h" +#include "blockforest/communication/UniformBufferedScheme.h" #include "core/Environment.h" @@ -30,6 +31,8 @@ #include "cuda/GPUField.h" #include "cuda/Kernel.h" #include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/GPUPackInfo.h" + #include "field/AddToStorage.h" #include "field/communication/UniformMPIDatatypeInfo.h" @@ -113,16 +116,22 @@ int main( int argc, char ** argv ) BlockDataID gpuFieldSrcID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Src" ); BlockDataID gpuFieldDstID = cuda::addGPUFieldToStorage<ScalarField>( blocks, cpuFieldID, "GPU Field Dst" ); - typedef blockforest::communication::UniformDirectScheme<stencil::D2Q9 > CommScheme; - CommScheme communication( blocks ); - communication.addDataToCommunicate( make_shared<field::communication::UniformMPIDatatypeInfo<GPUField> > (gpuFieldSrcID) ); + + typedef blockforest::communication::UniformBufferedScheme<stencil::D2Q9 > CommScheme; + typedef cuda::communication::GPUPackInfo<GPUField> Packing; + // Alternative, if CUDA enabled MPI is available + //blockforest::communication::UniformDirectScheme<stencil::D2Q9 > + //typedef field::communication::UniformMPIDatatypeInfo<GPUField> Packing + + CommScheme commScheme(blocks); + commScheme.addDataToCommunicate( make_shared<Packing>(gpuFieldSrcID) ); // Create Timeloop const uint_t numberOfTimesteps = uint_t(10); // number of timesteps for non-gui runs SweepTimeloop timeloop ( blocks, numberOfTimesteps ); // Registering the sweep - timeloop.add() << BeforeFunction( communication, "Communication" ) + timeloop.add() << BeforeFunction( commScheme, "Communication" ) << Sweep( GameOfLifeSweepCUDA(gpuFieldSrcID, gpuFieldDstID ), "GameOfLifeSweep" ); timeloop.add() << Sweep( cuda::fieldCpyFunctor<ScalarField, GPUField >(cpuFieldID, gpuFieldDstID) ); diff --git a/src/core/mpi/RecvBuffer.h b/src/core/mpi/RecvBuffer.h index c9317ca969f72eae517262aff50ff6805aff5cac..e1af9c26473236323d21c55a141ca866a3a13d52 100644 --- a/src/core/mpi/RecvBuffer.h +++ b/src/core/mpi/RecvBuffer.h @@ -140,7 +140,7 @@ public: inline void reserve( size_t newCapacity ); inline void resize ( size_t newSize ); template< typename V > void peek ( V& value ) const; - inline void skip ( size_t elements ); + inline T * skip ( size_t elements ); inline void clear (); inline void reset (); inline void readDebugMarker( const char * marker ); @@ -578,10 +578,15 @@ inline void GenericRecvBuffer<T>::peek( V& value ) const // This function skips \a element receive buffer elements of type \a T. */ template< typename T > // Element type -void GenericRecvBuffer<T>::skip( size_t elements ) +T * GenericRecvBuffer<T>::skip( size_t elements ) { + auto previous = cur_; cur_ += elements; - if( cur_ > end_ ) cur_ = end_; + + // Invariants check + WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ ); + + return previous; } //********************************************************************************************************************** diff --git a/src/core/mpi/SendBuffer.h b/src/core/mpi/SendBuffer.h index 841e8f886dcedc936382ae8ba327ea75c9ef683b..57d097a2113a5db73705f667f5e4fe9fe47495eb 100644 --- a/src/core/mpi/SendBuffer.h +++ b/src/core/mpi/SendBuffer.h @@ -154,6 +154,7 @@ public: //**Repositioning *************************************************************************************************** /*!\name Repositioning */ + inline T * forward( uint_t elements ); inline void rewind(const size_t & size); //@} //******************************************************************************************************************* @@ -508,6 +509,35 @@ GenericSendBuffer<T,G>::operator<<( V value ) // //====================================================================================================================== +//********************************************************************************************************************** +/*!\brief Forward the given number of elements. +// +// \param elements The number of elements to be advanced. +// \return Previous position. +// +// This function forwards \a element send buffer elements of type \a T and returns the previous buffer position. +*/ +template< typename T // Element type + , typename G > // Growth policy +T * GenericSendBuffer<T,G>::forward( uint_t elements ) +{ + const size_t rest = numeric_cast< size_t >( end_ - cur_ ); + + // Checking the size of the remaining memory + if( rest < elements ) { + extendMemory( size() + elements ); + } + + // Adding the data value + auto previous = cur_; + cur_ += elements; + + // Invariants check + WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ ); + + return previous; +} + //********************************************************************************************************************** /*!\brief Rewinds the stream to a previous position // diff --git a/src/cuda/AddGPUFieldToStorage.h b/src/cuda/AddGPUFieldToStorage.h index 67968b93e2534c8750e6c8600985c90cafeee65f..358a3397cdf7fe635fb30d07a2d24e3d27147695 100644 --- a/src/cuda/AddGPUFieldToStorage.h +++ b/src/cuda/AddGPUFieldToStorage.h @@ -45,7 +45,8 @@ namespace cuda { const std::string & identifier, uint_t fSize, const Layout layout = fzyx, - uint_t nrOfGhostLayers = 1 ); + uint_t nrOfGhostLayers = 1, + bool usePitchedMem = true ); @@ -61,7 +62,8 @@ namespace cuda { template< typename Field_T> BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs, ConstBlockDataID cpuFieldID, - const std::string & identifier ); + const std::string & identifier, + bool usePitchedMem = true ); diff --git a/src/cuda/AddGPUFieldToStorage.impl.h b/src/cuda/AddGPUFieldToStorage.impl.h index dfde01a84f2dd4cda9dff4ab19ac52e6ab967c9d..f007181a6f959e4128302114e15ecb01698913b0 100644 --- a/src/cuda/AddGPUFieldToStorage.impl.h +++ b/src/cuda/AddGPUFieldToStorage.impl.h @@ -33,26 +33,28 @@ namespace cuda { const StructuredBlockStorage * const bs, uint_t ghostLayers, uint_t fSize, - const field::Layout & layout ) + const field::Layout & layout, + bool usePitchedMem ) { return new GPUField_T( bs->getNumberOfXCells( *block ), bs->getNumberOfYCells( *block ), bs->getNumberOfZCells( *block ), - fSize, ghostLayers, layout ); + fSize, ghostLayers, layout, usePitchedMem ); } template< typename Field_T> GPUField< typename Field_T::value_type> * createGPUFieldFromCPUField( const IBlock * const block, const StructuredBlockStorage * const, - ConstBlockDataID cpuFieldID + ConstBlockDataID cpuFieldID, + bool usePitchedMem ) { typedef GPUField< typename Field_T::value_type> GPUField_T; const Field_T * f = block->getData<Field_T>( cpuFieldID ); auto gpuField = new GPUField_T( f->xSize(), f->ySize(), f->zSize(), f->fSize(), - f->nrOfGhostLayers(), f->layout() ); + f->nrOfGhostLayers(), f->layout(), usePitchedMem ); cuda::fieldCpy( *gpuField, *f ); @@ -67,9 +69,10 @@ namespace cuda { const std::string & identifier, uint_t fSize, const Layout layout, - uint_t nrOfGhostLayers ) + uint_t nrOfGhostLayers, + bool usePitchedMem ) { - auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout ); + auto func = boost::bind ( internal::createGPUField<GPUField_T>, _1, _2, nrOfGhostLayers, fSize, layout, usePitchedMem ); return bs->addStructuredBlockData< GPUField_T >( func, identifier ); } @@ -77,9 +80,10 @@ namespace cuda { template< typename Field_T> BlockDataID addGPUFieldToStorage( const shared_ptr< StructuredBlockStorage > & bs, ConstBlockDataID cpuFieldID, - const std::string & identifier ) + const std::string & identifier, + bool usePitchedMem ) { - auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID ); + auto func = boost::bind ( internal::createGPUFieldFromCPUField<Field_T>, _1, _2, cpuFieldID, usePitchedMem ); return bs->addStructuredBlockData< GPUField<typename Field_T::value_type> >( func, identifier ); } diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 6959180c2cc2c2df40633188487a001d0ff1c380..a4c149c369bb911a060490803c94a3304de5d0c8 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -4,6 +4,6 @@ # ################################################################################################### -waLBerla_add_module( DEPENDS core domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA ) +waLBerla_add_module( DEPENDS core communication domain_decomposition field stencil BUILD_ONLY_IF_FOUND CUDA ) ################################################################################################### \ No newline at end of file diff --git a/src/cuda/FieldAccessor.h b/src/cuda/FieldAccessor.h index bf45831bdf55c2536281959e5c16aaa2b3748144..b4650ea577bb462879298528158ddb43019f6356 100644 --- a/src/cuda/FieldAccessor.h +++ b/src/cuda/FieldAccessor.h @@ -39,10 +39,10 @@ namespace cuda { }; FieldAccessor( char * ptr, - uint32_t xOffset, - uint32_t yOffset, - uint32_t zOffset, - uint32_t fOffset, + uint_t xOffset, + uint_t yOffset, + uint_t zOffset, + uint_t fOffset, IndexingScheme indexingScheme ) : ptr_(ptr), xOffset_(xOffset), yOffset_(yOffset), zOffset_(zOffset), fOffset_(fOffset), indexingScheme_(indexingScheme ) @@ -65,7 +65,7 @@ namespace cuda { } - __device__ unsigned int getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim ) + __device__ uint_t getLinearIndex( uint3 blockIdx, uint3 threadIdx, uint3 gridDim, uint3 blockDim ) { return threadIdx.x + blockIdx.x * blockDim.x + @@ -73,6 +73,8 @@ namespace cuda { blockIdx.z * blockDim.x * gridDim.x * gridDim.y ; } + // This is always true for this specific field indexing class. + __device__ __forceinline__ bool isValidPosition() { return true; } __device__ T & get() { return * (T*)(ptr_); } __device__ T & get( int f) { return * (T*)(ptr_ + f * fOffset_); } @@ -80,26 +82,26 @@ namespace cuda { __device__ T & getNeighbor( int cx, int cy, int cz ) const { - return * (T*)( ptr_ + cx * (int)(xOffset_) + - cy * (int)(yOffset_) + - cz * (int)(zOffset_) ); + return * (T*)( ptr_ + cx * xOffset_ + + cy * yOffset_ + + cz * zOffset_ ); } __device__ T & getNeighbor( int cx, int cy, int cz, int cf ) { - return * (T*)( ptr_ + cx * (int)(xOffset_) + - cy * (int)(yOffset_) + - cz * (int)(zOffset_) + - cf * (int)(fOffset_) ); + return * (T*)( ptr_ + cx * xOffset_ + + cy * yOffset_ + + cz * zOffset_ + + cf * fOffset_ ); } protected: char * ptr_; - uint32_t xOffset_; - uint32_t yOffset_; - uint32_t zOffset_; - uint32_t fOffset_; + uint_t xOffset_; + uint_t yOffset_; + uint_t zOffset_; + uint_t fOffset_; IndexingScheme indexingScheme_; }; diff --git a/src/cuda/FieldAccessor3D.h b/src/cuda/FieldAccessor3D.h new file mode 100644 index 0000000000000000000000000000000000000000..411b64813e0b73531716cd09759852fabc5952e1 --- /dev/null +++ b/src/cuda/FieldAccessor3D.h @@ -0,0 +1,100 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldAccessor3D.h +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" + +#include <cuda_runtime.h> + + +namespace walberla { +namespace cuda { + + + template<typename T> + class FieldAccessor3D + { + public: + + FieldAccessor3D( char * ptr, + uint_t xOffset, + uint_t yOffset, + uint_t zOffset, + uint_t fOffset, + const dim3 & idxDim, + const dim3 & blockDim ) + : ptr_( ptr ), xOffset_( xOffset ), yOffset_( yOffset ), zOffset_( zOffset ), fOffset_( fOffset ), + idxDim_( idxDim ), blockDim_( blockDim ), isValidPosition_( false ) + {} + + __device__ __forceinline__ void set( const uint3& blockIdx, const uint3& threadIdx ) + { + uint_t x = blockIdx.x * blockDim_.x + threadIdx.x; + uint_t y = blockIdx.y * blockDim_.y + threadIdx.y; + uint_t z = blockIdx.z * blockDim_.z + threadIdx.z; + + if ( x < idxDim_.x && y < idxDim_.y && z < idxDim_.z ) + { + ptr_ += x * xOffset_ + y * yOffset_ + z * zOffset_; + isValidPosition_ = true; + } + } + + __device__ __forceinline__ bool isValidPosition() { return isValidPosition_; } + + __device__ __forceinline__ T & get() { return * (T*)(ptr_); } + __device__ __forceinline__ T & get( int f ) { return * (T*)(ptr_ + f * fOffset_); } + + + __device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz ) const + { + return * (T*)( ptr_ + cx * xOffset_ + + cy * yOffset_ + + cz * zOffset_ ); + } + + __device__ __forceinline__ T & getNeighbor( int cx, int cy, int cz, int cf ) + { + return * (T*)( ptr_ + cx * xOffset_ + + cy * yOffset_ + + cz * zOffset_ + + cf * fOffset_ ); + } + + + protected: + char * ptr_; + uint_t xOffset_; + uint_t yOffset_; + uint_t zOffset_; + uint_t fOffset_; + dim3 idxDim_; + dim3 blockDim_; + bool isValidPosition_; + }; + + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldCopy.h b/src/cuda/FieldCopy.h index e51f262522926490de80b618f8e4a9e93a95121b..87af5060d139cefd2ef529ab4c51878bc5f72816 100644 --- a/src/cuda/FieldCopy.h +++ b/src/cuda/FieldCopy.h @@ -109,12 +109,14 @@ namespace cuda { bool canCopy = ( src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() ) + dst.yAllocSize() == src.yAllocSize() && + dst.xSize() == src.xSize() ) || ( src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() ); + dst.xAllocSize() == src.xAllocSize() && + dst.fSize() == src.fSize() ); if ( !canCopy ) { WALBERLA_ABORT("Field have to have the same size "); @@ -127,20 +129,20 @@ namespace cuda { src.xAllocSize(), // inner dimension size src.yAllocSize() ); // next outer dimension size - p.extent.width = src.xAllocSize() * sizeof(T); - p.extent.height = src.yAllocSize(); - p.extent.depth = src.zAllocSize() * src.fAllocSize(); + p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); + p.extent.height = dst.yAllocSize(); + p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); } else { - p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer + p.srcPtr = make_cudaPitchedPtr( (void*)(src.data()), // pointer sizeof(T) * src.fAllocSize(), // pitch src.fAllocSize(), // inner dimension size src.xAllocSize() ); // next outer dimension size - p.extent.width = src.fAllocSize() * sizeof(T); - p.extent.height = src.xAllocSize(); - p.extent.depth = src.yAllocSize() * src.zAllocSize(); + p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); + p.extent.height = dst.xAllocSize(); + p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); } p.dstPtr = dst.pitchedPtr(); @@ -163,12 +165,14 @@ namespace cuda { bool canCopy = ( src.layout() == fzyx && dst.fAllocSize() == src.fAllocSize() && dst.zAllocSize() == src.zAllocSize() && - dst.yAllocSize() == src.yAllocSize() ) + dst.yAllocSize() == src.yAllocSize() && + dst.xSize() == src.xSize() ) || ( src.layout() == zyxf && dst.zAllocSize() == src.zAllocSize() && dst.yAllocSize() == src.yAllocSize() && - dst.xAllocSize() == src.xAllocSize() ); + dst.xAllocSize() == src.xAllocSize() && + dst.fSize() == src.fSize() ); if ( !canCopy ) { WALBERLA_ABORT("Field have to have the same size "); @@ -181,7 +185,7 @@ namespace cuda { dst.xAllocSize(), // inner dimension size dst.yAllocSize() ); // next outer dimension size - p.extent.width = dst.xAllocSize() * sizeof(T); + p.extent.width = std::min( dst.xAllocSize(), src.xAllocSize() ) * sizeof(T); p.extent.height = dst.yAllocSize(); p.extent.depth = dst.zAllocSize() * dst.fAllocSize(); } @@ -192,7 +196,7 @@ namespace cuda { dst.fAllocSize(), // inner dimension size dst.xAllocSize() ); // next outer dimension size - p.extent.width = dst.fAllocSize() * sizeof(T); + p.extent.width = std::min( dst.fAllocSize(), src.fAllocSize() ) * sizeof(T); p.extent.height = dst.xAllocSize(); p.extent.depth = dst.yAllocSize() * dst.zAllocSize(); } diff --git a/src/cuda/FieldIndexing.cpp b/src/cuda/FieldIndexing.cpp index a61e50a0d965d2528e46541623b702eb4b677c70..413bbe1aaf4ceb0915afe70bae445f0a0ac01b90 100644 --- a/src/cuda/FieldIndexing.cpp +++ b/src/cuda/FieldIndexing.cpp @@ -38,7 +38,7 @@ namespace cuda { template< typename T> FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, - uint3 _blockDim, uint3 _gridDim, + dim3 _blockDim, dim3 _gridDim, const FieldAccessor<T> _gpuAccess ) : field_( field ), blockDim_( _blockDim ), @@ -56,8 +56,8 @@ FieldIndexing<T>::FieldIndexing ( const GPUField<T> & field, threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); } WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, - "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << - "than the maximal thread count per CUDA block." ); + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << + "than the maximal thread count per CUDA block." ); } } @@ -93,29 +93,29 @@ void shiftCoordinatesWhileFastestCoordHasSizeOne( typename FieldAccessor<T>::Ind template< typename T> FieldIndexing<T> FieldIndexing<T>::interval ( const GPUField<T> & f, const CellInterval & ci, int fBegin, int fEnd ) { - unsigned int xOffset, yOffset, zOffset, fOffset; + uint_t xOffset, yOffset, zOffset, fOffset; if ( f.layout() == field::zyxf ) { fOffset = sizeof(T); - xOffset = uint32_c( f.pitchedPtr().pitch ); - yOffset = xOffset * uint32_c( f.xAllocSize() ); - zOffset = yOffset * uint32_c( f.yAllocSize() ); + xOffset = f.pitchedPtr().pitch; + yOffset = xOffset * f.xAllocSize(); + zOffset = yOffset * f.yAllocSize(); } else { xOffset = sizeof(T); - yOffset = uint32_c( f.pitchedPtr().pitch ); - zOffset = yOffset * uint32_c( f.yAllocSize() ); - fOffset = zOffset * uint32_c( f.zAllocSize() ); + yOffset = f.pitchedPtr().pitch; + zOffset = yOffset * f.yAllocSize(); + fOffset = zOffset * f.zAllocSize(); } char * data = (char*)f.pitchedPtr().ptr; // Jump over ghost cells to first inner cell cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() ); - data += ( ci.xMin() + gl )* int_c(xOffset) + - ( ci.yMin() + gl )* int_c(yOffset) + - ( ci.zMin() + gl )* int_c(zOffset); + data += ( ci.xMin() + gl )* xOffset + + ( ci.yMin() + gl )* yOffset + + ( ci.zMin() + gl )* zOffset; dim3 gridDim; @@ -183,6 +183,15 @@ FieldIndexing<T> FieldIndexing<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & return interval( f, ci, 0, 1 ); } +template< typename T> +FieldIndexing<T> FieldIndexing<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice ); + return interval( f, ci ); +} + template< typename T> FieldIndexing<T> FieldIndexing<T>::allInner ( const GPUField<T> & f ) { diff --git a/src/cuda/FieldIndexing.h b/src/cuda/FieldIndexing.h index 437b03453d8770d36d37e6b6f7cbe2572b062d0b..653a5de27c48840f43edc83521eb26be8f7d8b89 100644 --- a/src/cuda/FieldIndexing.h +++ b/src/cuda/FieldIndexing.h @@ -43,8 +43,8 @@ namespace cuda { //** Kernel call ****************************************************************************************** /*! \name Kernel call */ //@{ - uint3 blockDim() const { return blockDim_; } - uint3 gridDim () const { return gridDim_; } + dim3 blockDim() const { return blockDim_; } + dim3 gridDim () const { return gridDim_; } const FieldAccessor<T> & gpuAccess() const { return gpuAccess_; } //@} @@ -67,6 +67,8 @@ namespace cuda { stencil::Direction dir, bool fullSlice = false ); static FieldIndexing<T> sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, stencil::Direction dir, bool fullSlice = false ); + static FieldIndexing<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); static FieldIndexing<T> allInner ( const GPUField<T> & f ); static FieldIndexing<T> allWithGhostLayer ( const GPUField<T> & f ); @@ -76,12 +78,12 @@ namespace cuda { protected: FieldIndexing ( const GPUField<T> & field, - uint3 _blockDim, uint3 _gridDim, - const FieldAccessor<T> _gpuAccess ); + dim3 _blockDim, dim3 _gridDim, + const FieldAccessor<T> _gpuAccess ); const GPUField<T> & field_; - uint3 blockDim_; - uint3 gridDim_; + dim3 blockDim_; + dim3 gridDim_; FieldAccessor<T> gpuAccess_; }; diff --git a/src/cuda/FieldIndexing3D.cpp b/src/cuda/FieldIndexing3D.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a797a23d2b45e8b96bc3a4066061e39cadc6dc2 --- /dev/null +++ b/src/cuda/FieldIndexing3D.cpp @@ -0,0 +1,175 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexing3D.cpp +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#include "FieldIndexing3D.h" +#include "GPUTypesExplicitInstantiation.h" +#include "GPUField.h" + +#include "core/cell/CellInterval.h" +#include "core/debug/Debug.h" +#include "core/logging/Logging.h" +#include "field/Layout.h" + +#include <cuda_runtime.h> + +#include <limits> +#include <cmath> + +#define BLOCK_SIZE_X uint_t( 32 ) +#define BLOCK_SIZE_Y uint_t( 2 ) +#define BLOCK_SIZE_Z uint_t( 2 ) + +namespace walberla { +namespace cuda { + +// Returns ( a % b != 0 ) ? ( a / b + 1 ) : ( a / b ) +inline unsigned int iDivUp( unsigned int a, unsigned int b ) { return ( a + b - 1 ) / b; } + + +dim3 FieldIndexing3DBase::preferredBlockDim_( BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z ); + + +template< typename T> +FieldIndexing3D<T>::FieldIndexing3D( const GPUField<T> & field, + const dim3 & _blockDim, const dim3 & _gridDim, + const FieldAccessor3D<T> & _gpuAccess ) + : field_( field ), + blockDim_( _blockDim ), + gridDim_( _gridDim ), + gpuAccess_( _gpuAccess ) +{ + WALBERLA_DEBUG_SECTION() + { + cudaDeviceProp prop; + int count; + cudaGetDeviceCount(&count); + int threadsPerBlock = std::numeric_limits<int>::max(); + for (int i = 0; i < count; i++) { + cudaGetDeviceProperties(&prop, i); + threadsPerBlock = std::min( prop.maxThreadsPerBlock, threadsPerBlock ); + } + WALBERLA_ASSERT_LESS( int_c( blockDim_.x ), threadsPerBlock, + "InnerCoordThreadIndexing works only for fields where each dimension x,y,z is smaller " << + "than the maximal thread count per CUDA block." ); + } +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::interval( const GPUField<T> & f, const CellInterval & ci ) +{ + uint_t xOffset, yOffset, zOffset, fOffset; + + WALBERLA_ASSERT( f.layout() == field::fzyx ); + + xOffset = sizeof(T); + yOffset = f.pitchedPtr().pitch; + zOffset = yOffset * f.yAllocSize(); + fOffset = zOffset * f.zAllocSize(); + + char * data = (char*)f.pitchedPtr().ptr; + + // position data according to ci + cell_idx_t gl = cell_idx_c( f.nrOfGhostLayers() ); + data += ( ci.xMin() + gl ) * xOffset + + ( ci.yMin() + gl ) * yOffset + + ( ci.zMin() + gl ) * zOffset; + + + dim3 idxDim( (unsigned int)ci.xSize(), (unsigned int)ci.ySize(), (unsigned int)ci.zSize() ); + unsigned int bx = std::min( preferredBlockDim_.x, idxDim.x ); + unsigned int by = std::min( preferredBlockDim_.y, idxDim.y ); + unsigned int bz = std::min( preferredBlockDim_.z, idxDim.z ); + dim3 gridDim( iDivUp( idxDim.x, bx ), + iDivUp( idxDim.y, by ), + iDivUp( idxDim.z, bz ) ); + dim3 blockDim( bx, by, bz ); + + return FieldIndexing3D<T>( f, blockDim, gridDim, + FieldAccessor3D<T>( data, xOffset, yOffset, zOffset, fOffset, + idxDim, blockDim ) ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::xyz ( const GPUField<T> & f ) +{ + CellInterval ci( 0,0,0, + cell_idx_c( f.xSize() ) - 1, + cell_idx_c( f.ySize() ) - 1, + cell_idx_c( f.zSize() ) - 1 ); + + return interval( f, ci ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::withGhostLayerXYZ( const GPUField<T> & f, uint_t numGhostLayers ) +{ + cell_idx_t gl = std::min( cell_idx_c( numGhostLayers ), cell_idx_c( f.nrOfGhostLayers() ) ); + CellInterval ci( -gl,-gl,-gl, + cell_idx_c( f.xSize() ) + gl - 1, + cell_idx_c( f.ySize() ) + gl - 1, + cell_idx_c( f.zSize() ) + gl - 1 ); + + return interval( f, ci ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::ghostLayerOnlyXYZ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getGhostRegion( dir, ci, cell_idx_c(thickness), fullSlice ); + return interval( f, ci ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::sliceBeforeGhostLayerXYZ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getSliceBeforeGhostLayer( dir, ci, cell_idx_c(thickness), fullSlice ); + return interval( f, ci ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::sliceXYZ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice ) +{ + CellInterval ci; + f.getSlice( dir, ci, distance, cell_idx_c(thickness), fullSlice ); + return interval( f, ci ); +} + +template< typename T> +FieldIndexing3D<T> FieldIndexing3D<T>::intervalXYZ( const GPUField<T> & f, const cell::CellInterval & ci ) +{ + return interval( f, ci ); +} + + + + +GPU_CLASS_TEMPLATE_INSTANTIATION( FieldIndexing3D ) + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexing3D.h b/src/cuda/FieldIndexing3D.h new file mode 100644 index 0000000000000000000000000000000000000000..0dbe97566ec3e22ca22be8de41a2f62e0554957d --- /dev/null +++ b/src/cuda/FieldIndexing3D.h @@ -0,0 +1,105 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file FieldIndexing3D.h +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Indexing Scheme that executes all elements of inner coordinate within on thread block +// +//====================================================================================================================== + +#pragma once + +#include "FieldAccessor3D.h" + +#include "stencil/Directions.h" +#include <cuda_runtime.h> + +namespace walberla { namespace cell { class CellInterval; } } + +namespace walberla { +namespace cuda { + + // Forward Declarations + template< typename T> class GPUField; + + + class FieldIndexing3DBase + { + public: + + static void setPreferredBlockDim( dim3 blockDim ) { preferredBlockDim_ = blockDim; } + static void setPreferredBlockDim( unsigned int x, unsigned int y, unsigned int z ) { preferredBlockDim_ = dim3( x, y, z ); } + + protected: + + static dim3 preferredBlockDim_; + }; + + + template<typename T> + class FieldIndexing3D: public FieldIndexing3DBase + { + public: + + //** Kernel call ****************************************************************************************** + /*! \name Kernel call */ + //@{ + dim3 blockDim() const { return blockDim_; } + dim3 gridDim () const { return gridDim_; } + + const FieldAccessor3D<T> & gpuAccess() const { return gpuAccess_; } + //@} + //**************************************************************************************************************** + + + + + //** Creation ********************************************************************************************* + /*! \name Creation */ + //@{ + static FieldIndexing3D<T> interval ( const GPUField<T> & f, + const cell::CellInterval & ci ); + + + static FieldIndexing3D<T> xyz ( const GPUField<T> & f ); + static FieldIndexing3D<T> withGhostLayerXYZ ( const GPUField<T> & f, uint_t numGhostLayers ); + static FieldIndexing3D<T> ghostLayerOnlyXYZ ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); + static FieldIndexing3D<T> sliceBeforeGhostLayerXYZ ( const GPUField<T> & f, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); + static FieldIndexing3D<T> sliceXYZ ( const GPUField<T> & f, cell_idx_t distance, uint_t thickness, + stencil::Direction dir, bool fullSlice = false ); + + static FieldIndexing3D<T> intervalXYZ ( const GPUField<T> & f, const cell::CellInterval & ci ); + //@} + //**************************************************************************************************************** + + protected: + FieldIndexing3D ( const GPUField<T> & field, + const dim3 & _blockDim, const dim3 & _gridDim, + const FieldAccessor3D<T> & _gpuAccess ); + + const GPUField<T> & field_; + dim3 blockDim_; + dim3 gridDim_; + FieldAccessor3D<T> gpuAccess_; + }; + + +} // namespace cuda +} // namespace walberla + + diff --git a/src/cuda/FieldIndexingXYZ.cpp b/src/cuda/FieldIndexingXYZ.cpp index e1bb2cbb2ebb6913bb24969e773eb1f3b60c9f0c..8cc0bd6381224ca776cf411dc6fe6eb470b06e95 100644 --- a/src/cuda/FieldIndexingXYZ.cpp +++ b/src/cuda/FieldIndexingXYZ.cpp @@ -24,7 +24,7 @@ #include "GPUField.h" #include "core/cell/CellInterval.h" -#include "core/debug/Debug.h" + namespace walberla { namespace cuda { @@ -32,7 +32,7 @@ namespace cuda { template< typename T> FieldIndexingXYZ<T>::FieldIndexingXYZ ( const GPUField<T> & field, - uint3 _blockDim, uint3 _gridDim, + dim3 _blockDim, dim3 _gridDim, const FieldAccessorXYZ<T> _gpuAccess ) : field_( field ), blockDim_( _blockDim ), diff --git a/src/cuda/FieldIndexingXYZ.h b/src/cuda/FieldIndexingXYZ.h index 19d3f98c4a71e09d670324d553834ca56075f6e1..2c25975ea3ca417f0007de10764d5e66a42c389c 100644 --- a/src/cuda/FieldIndexingXYZ.h +++ b/src/cuda/FieldIndexingXYZ.h @@ -43,8 +43,8 @@ template< typename T> class GPUField; //** Kernel call ****************************************************************************************** /*! \name Kernel call */ //@{ - uint3 blockDim() const { return blockDim_; } - uint3 gridDim () const { return gridDim_; } + dim3 blockDim() const { return blockDim_; } + dim3 gridDim () const { return gridDim_; } const FieldAccessorXYZ<T> & gpuAccess() const { return gpuAccess_; } //@} @@ -64,11 +64,11 @@ template< typename T> class GPUField; //**************************************************************************************************************** protected: - FieldIndexingXYZ<T> ( const GPUField<T> & field, uint3 _blockDim, uint3 _gridDim, const FieldAccessorXYZ<T> _gpuAccess ); + FieldIndexingXYZ<T> ( const GPUField<T> & field, dim3 _blockDim, dim3 _gridDim, const FieldAccessorXYZ<T> _gpuAccess ); const GPUField<T> & field_; - uint3 blockDim_; - uint3 gridDim_; + dim3 blockDim_; + dim3 gridDim_; FieldAccessorXYZ<T> gpuAccess_; }; diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1d6c86e723befc0622631cda612359dbd429040 --- /dev/null +++ b/src/cuda/GPUCopy.cpp @@ -0,0 +1,144 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUCopy.cpp +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Copy routines of 4D intervals involving GPU buffers. +// +//====================================================================================================================== + +#include "core/debug/Debug.h" + +#include "GPUCopy.h" +#include "ErrorChecking.h" + + +namespace walberla { +namespace cuda { + + +void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ); + + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos.x = srcX * typeSz; + p.srcPos.y = srcY; + p.srcPos.z = srcF * srcAllocZ + srcZ; + p.srcPtr.ptr = src.ptr; + p.srcPtr.pitch = src.pitch; + p.srcPtr.xsize = src.xsize; + p.srcPtr.ysize = src.ysize; + + p.dstPos.x = dstX * typeSz; + p.dstPos.y = dstY; + p.dstPos.z = dstF * dstAllocZ + dstZ; + p.dstPtr.ptr = dst.ptr; + p.dstPtr.pitch = dst.pitch; + p.dstPtr.xsize = dst.xsize; + p.dstPtr.ysize = dst.ysize; + + p.extent.width = Nx * typeSz; + p.extent.height = Ny; + p.extent.depth = Nz * Nf; + + p.kind = cudaMemcpyDeviceToDevice; + + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); +} + + +void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ); + + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos.x = srcX * typeSz; + p.srcPos.y = srcY; + p.srcPos.z = srcF * srcAllocZ + srcZ; + p.srcPtr.ptr = src; + p.srcPtr.pitch = Nx * typeSz; + p.srcPtr.xsize = Nx * typeSz; + p.srcPtr.ysize = Ny; + + p.dstPos.x = dstX * typeSz; + p.dstPos.y = dstY; + p.dstPos.z = dstF * dstAllocZ + dstZ; + p.dstPtr.ptr = dst.ptr; + p.dstPtr.pitch = dst.pitch; + p.dstPtr.xsize = dst.xsize; + p.dstPtr.ysize = dst.ysize; + + p.extent.width = Nx * typeSz; + p.extent.height = Ny; + p.extent.depth = Nz * Nf; + + p.kind = cudaMemcpyHostToDevice; + + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); +} + + +void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos.x = srcX * typeSz; + p.srcPos.y = srcY; + p.srcPos.z = srcF * srcAllocZ + srcZ; + p.srcPtr.ptr = src.ptr; + p.srcPtr.pitch = src.pitch; + p.srcPtr.xsize = src.xsize; + p.srcPtr.ysize = src.ysize; + + p.dstPos.x = dstX * typeSz; + p.dstPos.y = dstY; + p.dstPos.z = dstF * dstAllocZ + dstZ; + p.dstPtr.ptr = dst; + p.dstPtr.pitch = Nx * typeSz; + p.dstPtr.xsize = Nx * typeSz; + p.dstPtr.ysize = Ny; + + p.extent.width = Nx * typeSz; + p.extent.height = Ny; + p.extent.depth = Nz * Nf; + + p.kind = cudaMemcpyDeviceToHost; + + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); +} + + + +} // namespace cuda +} // namespace walberla diff --git a/src/cuda/GPUCopy.h b/src/cuda/GPUCopy.h new file mode 100644 index 0000000000000000000000000000000000000000..7c6c69a7963cd8ecc3bdd1289065c3e070890a32 --- /dev/null +++ b/src/cuda/GPUCopy.h @@ -0,0 +1,182 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUCopy.h +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Copy routines of 4D intervals involving GPU buffers. +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" + +#include <cuda_runtime.h> + + +namespace walberla { +namespace cuda { + + +//******************************************************************************************************************* +/*! Restricted version of copyDevToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). + * See copyDevToDevFZYX() for more details. + *******************************************************************************************************************/ +void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); + + +//******************************************************************************************************************* +/*! Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout. + * + * \param dst destination buffer + * \param src source buffer + * \param typeSz size of an f element + * \param dstAllocZ allocation size in z direction of the destination buffer + * \param srcAllocZ allocation size in z direction of the source buffer + * \param dstX x coordinate of the interval start point in the destination buffer + * \param dstY y coordinate of the interval start point in the destination buffer + * \param dstZ z coordinate of the interval start point in the destination buffer + * \param dstF f coordinate of the interval start point in the destination buffer + * \param srcX x coordinate of the interval start point in the source buffer + * \param srcY y coordinate of the interval start point in the source buffer + * \param srcZ z coordinate of the interval start point in the source buffer + * \param srcF f coordinate of the interval start point in the source buffer + * \param Nx interval size in x direction + * \param Ny interval size in y direction + * \param Nz interval size in z direction + * \param Nf interval size in f direction + *******************************************************************************************************************/ +inline void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) + { + copyDevToDevFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF, + srcX, srcY, srcZ, srcF, + Nx, Ny, Nz, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyDevToDevFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF + f, + srcX, srcY, srcZ, srcF + f, + Nx, Ny, Nz, 1 ); + } + } +} + + +//******************************************************************************************************************* +/*! Restricted version of copyHostToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). + * See copyHostToDevFZYX() for more details. + *******************************************************************************************************************/ +void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); + + +//******************************************************************************************************************* +/*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See copyDevToDevFZYX() for + * parameter information. + *******************************************************************************************************************/ +inline void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) + { + copyHostToDevFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF, + srcX, srcY, srcZ, srcF, + Nx, Ny, Nz, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyHostToDevFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF + f, + srcX, srcY, srcZ, srcF + f, + Nx, Ny, Nz, 1 ); + } + } +} + + +//******************************************************************************************************************* +/*! Restricted version of copyDevToHostFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). + * See copyDevToHostFZYX() for more details. + *******************************************************************************************************************/ +void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); + + +//******************************************************************************************************************* +/*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See copyDevToDevFZYX() for + * parameter information. + *******************************************************************************************************************/ +inline void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, + uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, + uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, + uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, + uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +{ + if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) + { + copyDevToHostFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF, + srcX, srcY, srcZ, srcF, + Nx, Ny, Nz, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyDevToHostFZYXRestricted( dst, src, + typeSz, dstAllocZ, srcAllocZ, + dstX, dstY, dstZ, dstF + f, + srcX, srcY, srcZ, srcF + f, + Nx, Ny, Nz, 1 ); + } + } +} + + + +} // namespace cuda +} // namespace walberla diff --git a/src/cuda/GPUField.cpp b/src/cuda/GPUField.cpp index 326e5c0b7e62398a918eb6cd1f8d6bab1d4759b2..8d2b51ed4d2d121c227eefaefa89bc4b9e9a5cbf 100644 --- a/src/cuda/GPUField.cpp +++ b/src/cuda/GPUField.cpp @@ -31,10 +31,10 @@ namespace cuda { template<typename T> GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSize, - uint_t _nrOfGhostLayers, const Layout & _layout ) + uint_t _nrOfGhostLayers, const Layout & _layout, bool usePitchedMem ) : nrOfGhostLayers_( _nrOfGhostLayers ), xSize_( _xSize), ySize_( _ySize ), zSize_( _zSize ), fSize_( _fSize ), - layout_( _layout ) + layout_( _layout ), usePitchedMem_( usePitchedMem ) { cudaExtent extent; if ( layout_ == zyxf ) @@ -50,7 +50,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz extent.depth = (_zSize + 2 * _nrOfGhostLayers ) * _fSize; } - WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) ); + if ( usePitchedMem_ ) + { + WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) ); + } + else + { + pitchedPtr_ = make_cudaPitchedPtr( NULL, extent.width, extent.width, extent.height ); + WALBERLA_CUDA_CHECK ( cudaMalloc( &pitchedPtr_.ptr, extent.width * extent.height * extent.depth ) ); + } } @@ -86,23 +94,76 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, template<typename T> -void GPUField<T>::getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const +void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci, + cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const { WALBERLA_ASSERT_GREATER( thickness, 0 ); const cell_idx_t sizeArr [] = { cell_idx_c( xSize() ), cell_idx_c( ySize() ), - cell_idx_c( zSize() )}; + cell_idx_c( zSize() ) }; cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( nrOfGhostLayers() ) : 0; for(unsigned int dim = 0; dim< 3; ++dim) + { switch ( stencil::c[dim][d] ) { - case -1: ci.min()[dim] = 0; ci.max()[dim] = thickness - 1; break; - case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim] +fullSliceInc- 1; break; - case 1: ci.min()[dim] = sizeArr[dim]-thickness; ci.max()[dim] = sizeArr[dim] - 1; break; + case -1: + ci.min()[dim] = distance; + ci.max()[dim] = distance + thickness - 1; + break; + case 0: + ci.min()[dim] = -fullSliceInc; + ci.max()[dim] = sizeArr[dim] + fullSliceInc - 1; + break; + case 1: + ci.min()[dim] = sizeArr[dim] - distance - thickness; + ci.max()[dim] = sizeArr[dim] - distance - 1; + break; } + } +} + + +//******************************************************************************************************************* +/*! True if sizes of all dimensions match + *******************************************************************************************************************/ +template<typename T> +inline bool GPUField<T>::hasSameSize( const GPUField<T> & other ) const +{ + return xSize() == other.xSize() && + ySize() == other.ySize() && + zSize() == other.zSize(); +} + +//******************************************************************************************************************* +/*! True if allocation sizes of all dimensions match + *******************************************************************************************************************/ +template<typename T> +inline bool GPUField<T>::hasSameAllocSize( const GPUField<T> & other ) const +{ + return xAllocSize() == other.xAllocSize() && + yAllocSize() == other.yAllocSize() && + zAllocSize() == other.zAllocSize() && + fAllocSize() == other.fAllocSize(); +} + +//******************************************************************************************************************* +/*! Creates a new GPUField that has equal size, layout and memory type as this field but has uninitialized memory. + * + * \return a new FPUField that has to be freed by caller. + *******************************************************************************************************************/ +template <typename T> +GPUField<T> * GPUField<T>::cloneUninitialized() const +{ + GPUField<T> * res = new GPUField<T>( xSize(), ySize(), zSize(), fSize(), + nrOfGhostLayers(), layout(), isPitchedMem() ); + + WALBERLA_ASSERT( hasSameAllocSize( *res ) ); + WALBERLA_ASSERT( hasSameSize( *res ) ); + WALBERLA_ASSERT( layout() == res->layout() ); + WALBERLA_ASSERT( isPitchedMem() == res->isPitchedMem() ); + return res; } //******************************************************************************************************************* @@ -168,11 +229,10 @@ uint_t GPUField<T>::fAllocSize() const template<typename T> void GPUField<T>::swapDataPointers( GPUField<T> & other ) { - WALBERLA_ASSERT_EQUAL( xAllocSize(), other.xAllocSize() ); - WALBERLA_ASSERT_EQUAL( yAllocSize(), other.yAllocSize() ); - WALBERLA_ASSERT_EQUAL( zAllocSize(), other.zAllocSize() ); - WALBERLA_ASSERT_EQUAL( fAllocSize(), other.fAllocSize() ); - WALBERLA_ASSERT_EQUAL( layout(), other.layout() ); + WALBERLA_ASSERT( hasSameAllocSize( other ) ); + WALBERLA_ASSERT( hasSameSize( other ) ); + WALBERLA_ASSERT( layout() == other.layout() ); + WALBERLA_ASSERT( isPitchedMem() == other.isPitchedMem() ); std::swap( pitchedPtr_, other.pitchedPtr_ ); } diff --git a/src/cuda/GPUField.h b/src/cuda/GPUField.h index 52cc002b3d98f9cc196b9ddc8d12fc9f9f2c7214..3153aba60831751d68b8404bf70cbaaa3a64f1e0 100755 --- a/src/cuda/GPUField.h +++ b/src/cuda/GPUField.h @@ -63,12 +63,14 @@ namespace cuda { typedef T value_type; GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSize, - uint_t _nrOfGhostLayers, const Layout & _layout = zyxf ); + uint_t _nrOfGhostLayers, const Layout & _layout = zyxf, bool usePitchedMem = true ); ~GPUField(); Layout layout() const { return layout_; } + bool isPitchedMem() const { return usePitchedMem_; } + cudaPitchedPtr pitchedPtr() const { return pitchedPtr_; } @@ -76,6 +78,7 @@ namespace cuda { inline uint_t ySize() const { return ySize_; } inline uint_t zSize() const { return zSize_; } inline uint_t fSize() const { return fSize_; } + inline uint_t size() const { return fSize() * xSize() * ySize() * zSize(); } cell_idx_t xOff() const { return cell_idx_c( nrOfGhostLayers_ ); } cell_idx_t yOff() const { return cell_idx_c( nrOfGhostLayers_ ); } @@ -86,10 +89,15 @@ namespace cuda { uint_t yAllocSize() const; uint_t zAllocSize() const; uint_t fAllocSize() const; + inline uint_t allocSize() const { return fAllocSize() * xAllocSize() * yAllocSize() * zAllocSize(); } + + inline bool hasSameAllocSize( const GPUField<T> & other ) const; + inline bool hasSameSize( const GPUField<T> & other ) const; + GPUField<T> * cloneUninitialized() const; void swapDataPointers( GPUField<T> & other ); - void swapDataPointers( GPUField<T> * other ) { swapDataPointers( *other); } + void swapDataPointers( GPUField<T> * other ) { swapDataPointers( *other ); } inline uint_t nrOfGhostLayers() const { return nrOfGhostLayers_; } @@ -99,8 +107,12 @@ namespace cuda { void getGhostRegion( stencil::Direction d, CellInterval & ci, cell_idx_t thickness, bool fullSlice ) const; void getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, - cell_idx_t thickness, bool fullSlice ) const; - + cell_idx_t thickness, bool fullSlice ) const + { + getSlice( d, ci, 0, thickness, fullSlice ); + } + void getSlice(stencil::Direction d, CellInterval & ci, + cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const; void * data() { return pitchedPtr_.ptr; } const void * data() const { return pitchedPtr_.ptr; } @@ -113,6 +125,7 @@ namespace cuda { uint_t zSize_; uint_t fSize_; Layout layout_; + bool usePitchedMem_; }; diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..6a3e98434b0418dad921303f7b82fda46eb456de --- /dev/null +++ b/src/cuda/communication/GPUPackInfo.h @@ -0,0 +1,174 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUPackInfo.h +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//====================================================================================================================== + +#pragma once + +#include "cuda/GPUCopy.h" +#include "communication/UniformPackInfo.h" +#include "core/debug/Debug.h" +#include "field/GhostRegions.h" +#include "stencil/Directions.h" + + +namespace walberla { +namespace cuda { +namespace communication { + + +/** + * Data packing/unpacking for ghost layer based communication of a cuda::GPUField + * \ingroup cuda + * Template Parameters: + * - GPUField_T A fully qualified GPUField. + */ +template<typename GPUField_T> +class GPUPackInfo : public walberla::communication::UniformPackInfo +{ +public: + typedef typename GPUField_T::value_type T; + + GPUPackInfo( const BlockDataID & bdId ) : bdId_( bdId ), communicateAllGhostLayers_( true ), + numberOfGhostLayers_( 0 ) {} + + GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers ) : bdId_( bdId ), + communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers ) {} + + virtual ~GPUPackInfo() {} + + bool constantDataExchange() const { return mpi::BufferSizeTrait<T>::constantSize; } + bool threadsafeReceiving() const { return true; } + + void unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer); + + void communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir); + +protected: + void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const; + + uint_t numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const; + + const BlockDataID bdId_; + bool communicateAllGhostLayers_; + uint_t numberOfGhostLayers_; +}; + + +template<typename GPUField_T> +void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer) +{ + GPUField_T * f = receiver->getData< GPUField_T >( bdId_ ); + WALBERLA_ASSERT_NOT_NULLPTR(f); + + if ( f->layout() != fzyx ) + { + WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); + } + + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) ); + + CellInterval ci = field::getGhostRegion( *f, dir, nrOfGhostLayers, false ); + + uint_t nrOfBytesToRead = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T); + + unsigned char * buf = buffer.skip(nrOfBytesToRead); + + copyHostToDevFZYX( f->pitchedPtr(), buf, + sizeof(T), f->zAllocSize(), ci.zSize(), + ci.xMin() + nrOfGhostLayers, ci.yMin() + nrOfGhostLayers, ci.zMin() + nrOfGhostLayers, 0, + 0, 0, 0, 0, + ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() ); +} + + +template<typename GPUField_T> +void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * receiver, stencil::Direction dir) +{ + const GPUField_T * sf = sender ->getData< GPUField_T >( bdId_ ); + GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ ); + + if ( sf->layout() != fzyx || rf->layout() != fzyx ) + { + WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); + } + + WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()); + WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()); + WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()); + WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()); + + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( sf ) ); + + CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false ); + CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false ); + + copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(), + sizeof(T), rf->zAllocSize(), sf->zAllocSize(), + rCi.xMin() + nrOfGhostLayers, rCi.yMin() + nrOfGhostLayers, rCi.zMin() + nrOfGhostLayers, 0, + sCi.xMin() + nrOfGhostLayers, sCi.yMin() + nrOfGhostLayers, sCi.zMin() + nrOfGhostLayers, 0, + rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() ); +} + + +template<typename GPUField_T> +void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const +{ + const GPUField_T * f = sender->getData< GPUField_T >( bdId_ ); + WALBERLA_ASSERT_NOT_NULLPTR(f); + + if ( f->layout() != fzyx ) + { + WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); + } + + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) ); + + CellInterval ci = field::getSliceBeforeGhostLayer( *f, dir, nrOfGhostLayers, false ); + + uint_t nrOfBytesToPack = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T); + + unsigned char * buf = outBuffer.forward( nrOfBytesToPack ); + + copyDevToHostFZYX( buf, f->pitchedPtr(), + sizeof(T), ci.zSize(), f->zAllocSize(), + 0, 0, 0, 0, + ci.xMin() + nrOfGhostLayers, ci.yMin() + nrOfGhostLayers, ci.zMin() + nrOfGhostLayers, 0, + ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() ); +} + + +template<typename GPUField_T> +uint_t GPUPackInfo<GPUField_T>::numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const +{ + if( communicateAllGhostLayers_ ) + { + return field->nrOfGhostLayers(); + } + else + { + WALBERLA_ASSERT_LESS_EQUAL( numberOfGhostLayers_, field->nrOfGhostLayers() ); + return numberOfGhostLayers_; + } +} + + + +} // namespace communication +} // namespace cuda +} // namespace walberla diff --git a/src/cuda/sweeps/GPUSweepBase.h b/src/cuda/sweeps/GPUSweepBase.h new file mode 100644 index 0000000000000000000000000000000000000000..fbd5e2f8e6ff688a95c2e13425f58ff49085b8c9 --- /dev/null +++ b/src/cuda/sweeps/GPUSweepBase.h @@ -0,0 +1,76 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUSweepBase.h +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#pragma once + + +#include "cuda/GPUField.h" + +#include "core/debug/Debug.h" + +#include "field/SwapableCompare.h" + +#include <set> + +namespace walberla { +namespace cuda { + + +template < typename GPUField_T > +class GPUSweepBase +{ +public: + GPUSweepBase() + { + } + virtual ~GPUSweepBase() + { + for( auto field = dstFields_.begin(); field != dstFields_.end(); ++field ) + { + delete *field; + } + } + + GPUField_T * getDstField( GPUField_T * const src ) + { + auto it = dstFields_.find( src ); + if( it != dstFields_.end() ) + { + return *it; + } + + GPUField_T * dst = src->cloneUninitialized(); + WALBERLA_ASSERT_NOT_NULLPTR( dst ); + + dstFields_.insert( dst ); + + return dst; + } + +protected: + + std::set< GPUField_T *, field::SwapableCompare< GPUField_T * > > dstFields_; +}; + + +} // namespace cuda +} // namespace walberla + diff --git a/src/field/GhostLayerField.impl.h b/src/field/GhostLayerField.impl.h index 84bd0aa3781b886cfa0c83c71649abf445e8b5e0..6a4d725f319e9e7fa2c718c194ca3b6528db0dc5 100644 --- a/src/field/GhostLayerField.impl.h +++ b/src/field/GhostLayerField.impl.h @@ -21,6 +21,7 @@ //====================================================================================================================== #include "field/iterators/IteratorMacros.h" +#include "field/GhostRegions.h" #include "core/debug/Debug.h" @@ -348,80 +349,18 @@ namespace field { xs, ys, zs, 1 ); } - - - - //******************************************************************************************************************* - /*!\brief Constructs CellInterval containing the ghost region in the specified direction - * - * \param d direction of the ghost layer For W,E,N,S,T,B a slice is returned - * for NW, NE, .. an edge is returned - * for TBE, TNW, ... a corner ( single cell ) is returned - * \param ci output parameter, CellInterval describing the ghost layer - * \param thickness how many ghost layer to return, if thickness < nrOfGhostLayers() the ghostlayers - * nearest to the domain are returned - * \param fullSlice if true also the ghost cells in directions orthogonal to d are contained in the - * returned cell interval. Example (d=W ): if fullSlice then also the ghost cells in N-S and T-B - * are included. - *******************************************************************************************************************/ template<typename T, uint_t fSize_> void GhostLayerField<T,fSize_>::getGhostRegion(stencil::Direction d, CellInterval & ci, cell_idx_t thickness, bool fullSlice ) const { - const cell_idx_t sizeArr [] = { cell_idx_c( Field<T,fSize_>::xSize() ), - cell_idx_c( Field<T,fSize_>::ySize() ), - cell_idx_c( Field<T,fSize_>::zSize() )}; - - WALBERLA_ASSERT_GREATER( thickness, 0 ); - WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), gl_ ); - const cell_idx_t ghosts = cell_idx_c ( thickness ); - - cell_idx_t fullSliceInc = fullSlice ? cell_idx_c(gl_) : 0; - - for( uint_t dim = 0; dim< 3; ++dim ) - switch ( stencil::c[dim][d] ) - { - case -1: ci.min()[dim] = -ghosts; ci.max()[dim] = 0 - 1; break; - case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim]+fullSliceInc - 1; break; - case 1: ci.min()[dim] = sizeArr[dim]; ci.max()[dim] = sizeArr[dim]+ghosts - 1; break; - } - + ci = field::getGhostRegion( *this, d, thickness, fullSlice ); } - - //******************************************************************************************************************* - /*!\brief Constructs CellInterval containing the last slice before the ghost layer begins - * for a given direction. - * - * \param d direction of the border . For W,E,N,S,T,B a slice is returned - * for NW, NE, .. an edge is returned - * for TBE, TNW, ... a corner ( single cell ) is returned - * \param ci output parameter, CellInterval describing the slice before ghost layer - * \param thickness how many slices to return - * \param fullSlice if true also the ghost cells in directions orthogonal to d are contained in the \ - * returned cell interval. Example (d=W ): if fullSlice then also the ghost layer in N-S and T-B - * are included, otherwise only inner cells are returned - * - *******************************************************************************************************************/ template<typename T, uint_t fSize_> void GhostLayerField<T,fSize_>::getSliceBeforeGhostLayer(stencil::Direction d, CellInterval & ci, cell_idx_t thickness, bool fullSlice ) const { - WALBERLA_ASSERT_GREATER( thickness, 0 ); - - const cell_idx_t sizeArr [] = { cell_idx_c( Field<T,fSize_>::xSize() ), - cell_idx_c( Field<T,fSize_>::ySize() ), - cell_idx_c( Field<T,fSize_>::zSize() )}; - - - cell_idx_t fullSliceInc = fullSlice ? cell_idx_c(gl_) : 0; - for( uint_t dim = 0; dim< 3; ++dim ) - switch ( stencil::c[dim][d] ) - { - case -1: ci.min()[dim] = 0; ci.max()[dim] = thickness - 1; break; - case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim] +fullSliceInc- 1; break; - case 1: ci.min()[dim] = sizeArr[dim]-thickness; ci.max()[dim] = sizeArr[dim] - 1; break; - } + ci = field::getSliceBeforeGhostLayer( *this, d, thickness, fullSlice ); } //******************************************************************************************************************* diff --git a/src/field/GhostRegions.h b/src/field/GhostRegions.h new file mode 100644 index 0000000000000000000000000000000000000000..359163aedd81b24e03bddbabb35453b006926146 --- /dev/null +++ b/src/field/GhostRegions.h @@ -0,0 +1,109 @@ +//============================================================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GhostRegions.h +//! \ingroup field +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author Martin Bauer <martin.bauer@fau.de> +// +//============================================================================================================================================================== + +#pragma once + +#include "core/cell/CellInterval.h" + +namespace walberla { +namespace field { + + +//******************************************************************************************************************* +/*!\brief Constructs CellInterval containing the ghost region in the specified direction + * + * \param d direction of the ghost layer For W,E,N,S,T,B a slice is returned + * for NW, NE, .. an edge is returned + * for TBE, TNW, ... a corner ( single cell ) is returned + * \param thickness how many ghost layer to return, if thickness < nrOfGhostLayers() the ghostlayers + * nearest to the domain are returned + * \param fullSlice if true also the ghost cells in directions orthogonal to d are contained in the + * returned cell interval. Example (d=W ): if fullSlice then also the ghost cells in N-S and T-B + * are included. + * \return CellInterval describing the ghost layer + *******************************************************************************************************************/ +template< typename GhostLayerField_T> +CellInterval getGhostRegion( const GhostLayerField_T & f, stencil::Direction d, + cell_idx_t thickness, bool fullSlice ) +{ + const cell_idx_t sizeArr [] = { cell_idx_c( f.xSize() ), + cell_idx_c( f.ySize() ), + cell_idx_c( f.zSize() )}; + + WALBERLA_ASSERT_GREATER( thickness, 0 ); + WALBERLA_ASSERT_LESS_EQUAL( uint_c(thickness), f.nrOfGhostLayers() ); + const cell_idx_t ghosts = cell_idx_c ( thickness ); + + cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( f.nrOfGhostLayers() ) : 0; + + CellInterval ci; + for( uint_t dim = 0; dim< 3; ++dim ) + switch ( stencil::c[dim][d] ) + { + case -1: ci.min()[dim] = -ghosts; ci.max()[dim] = 0 - 1; break; + case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim]+fullSliceInc - 1; break; + case 1: ci.min()[dim] = sizeArr[dim]; ci.max()[dim] = sizeArr[dim]+ghosts - 1; break; + } + return ci; +} + + + +//******************************************************************************************************************* +/*!\brief Constructs CellInterval containing the last slice before the ghost layer begins + * for a given direction. + * + * \param d direction of the border . For W,E,N,S,T,B a slice is returned + * for NW, NE, .. an edge is returned + * for TBE, TNW, ... a corner ( single cell ) is returned + * \param thickness how many slices to return + * \param fullSlice if true also the ghost cells in directions orthogonal to d are contained in the \ + * returned cell interval. Example (d=W ): if fullSlice then also the ghost layer in N-S and T-B + * are included, otherwise only inner cells are returned + * \return CellInterval describing the slice before ghost layer + *******************************************************************************************************************/ +template< typename GhostLayerField_T> +CellInterval getSliceBeforeGhostLayer(const GhostLayerField_T & f, stencil::Direction d, + cell_idx_t thickness, bool fullSlice ) +{ + WALBERLA_ASSERT_GREATER( thickness, 0 ); + + const cell_idx_t sizeArr [] = { cell_idx_c( f.xSize() ), + cell_idx_c( f.ySize() ), + cell_idx_c( f.zSize() )}; + + + CellInterval ci; + cell_idx_t fullSliceInc = fullSlice ? cell_idx_c( f.nrOfGhostLayers()) : 0; + for( uint_t dim = 0; dim< 3; ++dim ) + switch ( stencil::c[dim][d] ) + { + case -1: ci.min()[dim] = 0; ci.max()[dim] = thickness - 1; break; + case 0: ci.min()[dim] = -fullSliceInc; ci.max()[dim] = sizeArr[dim] +fullSliceInc- 1; break; + case 1: ci.min()[dim] = sizeArr[dim]-thickness; ci.max()[dim] = sizeArr[dim] - 1; break; + } + return ci; +} + + +} // namespace field +} // namespace walberla diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 89d914803497c7f907f7338eeea19efaba01769f..74769cbbc2458c61dc44da91605b976904adc927 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -4,6 +4,12 @@ # ################################################################################################### +waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest ) +waLBerla_execute_test( NAME GPUPackInfoTest ) + +waLBerla_compile_test( FILES communication/CommTest ) +waLBerla_execute_test( NAME CommTest ) + waLBerla_compile_test( FILES FieldTransferTest ) waLBerla_execute_test( NAME FieldTransferTest ) @@ -13,3 +19,6 @@ waLBerla_execute_test( NAME SimpleKernelTest ) waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui ) waLBerla_execute_test( NAME CudaMPI ) +waLBerla_compile_test( FILES FieldIndexing3DTest.cpp FieldIndexing3DTest.cu ) +waLBerla_execute_test( NAME FieldIndexing3DTest ) + diff --git a/tests/cuda/CudaTestCommon.h b/tests/cuda/CudaTestCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..8de39016e60dc88ba0b5a2c26632e43b4c12ccbd --- /dev/null +++ b/tests/cuda/CudaTestCommon.h @@ -0,0 +1,88 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#pragma once + +#include "core/DataTypes.h" +#include "core/debug/CheckFunctions.h" + +#include "field/GhostLayerField.h" + + +#ifdef DBG_PRINT_ON +#define DBG_PRINT_FIELD( f ) printField( f ) +#define DBG_PRINT( fmt, ... ) printf( fmt, ##__VA_ARGS__ ) +#else +#define DBG_PRINT_FIELD( f ) +#define DBG_PRINT(fmt, ...) +#endif + +template<typename Field_T> +void printField( Field_T& field ) +{ + using namespace walberla; + cell_idx_t fs = 0; + cell_idx_t zs = -(cell_idx_t)field.nrOfGhostLayers(); + cell_idx_t ys = -(cell_idx_t)field.nrOfGhostLayers(); + cell_idx_t xs = -(cell_idx_t)field.nrOfGhostLayers(); + cell_idx_t nf = (cell_idx_t)field.fSize(); + cell_idx_t nz = (cell_idx_t)(field.zSize() + field.nrOfGhostLayers()); + cell_idx_t ny = (cell_idx_t)(field.ySize() + field.nrOfGhostLayers()); + cell_idx_t nx = (cell_idx_t)(field.xSize() + field.nrOfGhostLayers()); + for ( cell_idx_t f = fs; f < nf; ++f ) { + std::cout << "{"; + for ( cell_idx_t z = zs; z < nz; ++z ) { + std::cout << ( z == zs ? "[" : " [" ); + for ( cell_idx_t y = ys; y < ny; ++y ) { + std::cout << "("; + for ( cell_idx_t x = xs; x < nx; ++x ) { + std::cout << field( x, y, z, f ) << ( x == nx-1 ? "" : " " ); + } + std::cout << ( y == ny-1 ? ")" : ") " ); + } + std::cout << "]\n"; + } + std::cout << "}\n"; + } +} + + +#define CHECK_FIELD_EQUAL( f1, f2 ) WALBERLA_CHECK( checkFieldEqual( f1, f2 ), "Field differ" ) + +template< typename Field_T > +bool checkFieldEqual( Field_T& field1, Field_T& field2 ) +{ + using namespace walberla; + WALBERLA_ASSERT( field1.xSize() == field2.xSize() && + field1.ySize() == field2.ySize() && + field1.zSize() == field2.zSize() && + field1.fSize() == field2.fSize() ); + + WALBERLA_FOR_ALL_CELLS_XYZ( &field2, + for ( uint_t f = 0; f < field1.fSize(); ++f ) + { + if ( field1.get( x, y, z, f ) != field2.get( x, y, z, f ) ) + { + return false; + } + } + ) + return true; +} diff --git a/tests/cuda/FieldIndexing3DTest.cpp b/tests/cuda/FieldIndexing3DTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1044d08a01ef22ebfc9d4e162dd89e5742e8adc --- /dev/null +++ b/tests/cuda/FieldIndexing3DTest.cpp @@ -0,0 +1,110 @@ + +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + + +#include "core/debug/TestSubsystem.h" +#include "core/Environment.h" +#include "core/mpi/Datatype.h" + +#include "field/GhostLayerField.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" +#include "cuda/Kernel.h" +#include "cuda/FieldIndexing3D.h" + +//#define DBG_PRINT_ON +#include "CudaTestCommon.h" + +#include "FieldIndexing3DTest.h" + +using namespace walberla; + + +typedef cuda::FieldIndexing3D<int> FieldIdx3D_T; +typedef GhostLayerField<int , F_SIZE> HostField_T; +typedef cuda::GPUField<int> GPUField_T; + + + +void xyzTest() +{ + const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); + cuda::fieldCpy( deviceField, emptyField ); + + auto setValue = cuda::make_kernel( &setValueKernel ); + setValue.addFieldIndexingParam( FieldIdx3D_T::xyz( deviceField ) ); + setValue(); + + HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + cuda::fieldCpy( resultField, deviceField ); + + HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + WALBERLA_FOR_ALL_CELLS_XYZ( &expectedField, + for ( uint_t f = 0; f < expectedField.fSize(); ++f ) + { + expectedField.get( x, y, z, f ) = IDX4D( x, y, z, f ); + } + ) + + DBG_PRINT_FIELD( resultField ); + CHECK_FIELD_EQUAL( resultField, expectedField ); +} + + +void sliceBeforeGhostLayerXYZTest() +{ + const HostField_T emptyField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + GPUField_T deviceField( X_SIZE, Y_SIZE, Z_SIZE, F_SIZE, 1, LAYOUT ); + cuda::fieldCpy( deviceField, emptyField ); + + auto setValue = cuda::make_kernel( &setValueKernel ); + setValue.addFieldIndexingParam( FieldIdx3D_T::sliceBeforeGhostLayerXYZ( deviceField, 1, stencil::B, true ) ); + setValue(); + + HostField_T resultField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + cuda::fieldCpy( resultField, deviceField ); + + HostField_T expectedField( X_SIZE, Y_SIZE, Z_SIZE, GL_SIZE, -1, LAYOUT ); + CellInterval ci; + expectedField.getSliceBeforeGhostLayer( stencil::B, ci, 1, true ); + WALBERLA_FOR_ALL_CELLS_IN_INTERVAL_XYZ( ci, + for ( uint_t f = 0; f < expectedField.fSize(); ++f ) + { + expectedField.get( x, y, z, f ) = IDX4D( x - ci.xMin(), y - ci.yMin(), z - ci.zMin(), f ); + } + ) + DBG_PRINT_FIELD( resultField ); + CHECK_FIELD_EQUAL( resultField, expectedField ); +} + + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv( argc, argv ); + + xyzTest(); + sliceBeforeGhostLayerXYZTest(); + + return 0; +} diff --git a/tests/cuda/FieldIndexing3DTest.cu b/tests/cuda/FieldIndexing3DTest.cu new file mode 100644 index 0000000000000000000000000000000000000000..7ade5bfc6a4149ffb65255c1e7a99aa9e159b72d --- /dev/null +++ b/tests/cuda/FieldIndexing3DTest.cu @@ -0,0 +1,43 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#include "FieldIndexing3DTest.h" + + +namespace walberla { + +__global__ void setValueKernel( FieldAccessor3D_T fa ) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int z = blockIdx.z * blockDim.z + threadIdx.z; + fa.set( blockIdx, threadIdx ); + + if ( fa.isValidPosition() ) + { + for ( int f = 0; f < F_SIZE; ++f ) + { + fa.get(f) = IDX4D( x, y, z, f ); + } + } +} + + +} // namespace walberla diff --git a/tests/cuda/FieldIndexing3DTest.h b/tests/cuda/FieldIndexing3DTest.h new file mode 100644 index 0000000000000000000000000000000000000000..ab8eb00ce675f5b16dbca4be4bf0afb4aa2b4756 --- /dev/null +++ b/tests/cuda/FieldIndexing3DTest.h @@ -0,0 +1,45 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + +#pragma once + +#include "cuda/FieldAccessor3D.h" + +#define X_SIZE (64-2) +#define Y_SIZE (64-2) +#define Z_SIZE (64-2) +#define F_SIZE 19 +#define LAYOUT field::fzyx +#define GL_SIZE 1 + +#define YOFFSET ( X_SIZE ) +#define ZOFFSET ( ( Y_SIZE ) * ( YOFFSET ) ) +#define FOFFSET ( ( Z_SIZE ) * ( ZOFFSET ) ) +#define IDX4D( x, y, z, f ) ( (int)( (f) * (FOFFSET) + (z) * (Z_SIZE) + (y) * (YOFFSET) + (x) ) ) + + +namespace walberla { + +typedef cuda::FieldAccessor3D<int> FieldAccessor3D_T; + +__global__ void setValueKernel( FieldAccessor3D_T fa ); + + +} // namespace walberla diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/cuda/FieldTransferTest.cpp index e37768d9d2d1dd5a2fd188d057bc493f514d1f7a..4cdd11b9f5ede14ec9c07a7ad5a3465b8a93df98 100644 --- a/tests/cuda/FieldTransferTest.cpp +++ b/tests/cuda/FieldTransferTest.cpp @@ -42,7 +42,7 @@ void simpleTransfer() WALBERLA_CHECK_EQUAL( h_f1.ySize() ,d_f.ySize() ); WALBERLA_CHECK_EQUAL( h_f1.zSize() ,d_f.zSize() ); WALBERLA_CHECK_EQUAL( h_f1.fSize() ,d_f.fSize() ); - WALBERLA_CHECK_EQUAL( h_f1.layout() ,d_f.layout() ); + WALBERLA_CHECK_EQUAL( h_f1.layout(),d_f.layout() ); cuda::fieldCpy( d_f, h_f1 ); diff --git a/tests/cuda/SimpleKernelTest.cpp b/tests/cuda/SimpleKernelTest.cpp index 8313947ed6c672a2aa3c92b641b37b8eced9515b..4a19ec001ecfa1725bcb42ecf707b0cdfdf7dd49 100644 --- a/tests/cuda/SimpleKernelTest.cpp +++ b/tests/cuda/SimpleKernelTest.cpp @@ -101,7 +101,7 @@ int main( int argc, char ** argv ) cuda::fieldCpy( *cpuField, *gpuField ); - WALBERLA_ASSERT_EQUAL( cpuField->get(0,0,0), 2 ); + WALBERLA_ASSERT_FLOAT_EQUAL( cpuField->get(0,0,0), real_t(2) ); } diff --git a/tests/cuda/communication/CommTest.cpp b/tests/cuda/communication/CommTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0b7378060ad58c8813efa3d3a1e2b88d4a72ee01 --- /dev/null +++ b/tests/cuda/communication/CommTest.cpp @@ -0,0 +1,242 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +// +//====================================================================================================================== + + +#include "core/debug/TestSubsystem.h" +#include "core/Environment.h" +#include "core/mpi/Datatype.h" + +#include "field/communication/MPIDatatypes.h" +#include "field/Field.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" + +#define NUM_ITER 100 +#define SIZE_X 16 +#define SIZE_Y 16 +#define SIZE_Z 16 +#define LAYOUT field::fzyx + + +using namespace walberla; + +void hostToHost() +{ + Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0, LAYOUT); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + hostField2.set(hostField1); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void hostToDevice() +{ + Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0.0, LAYOUT); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + cuda::fieldCpy(deviceField, hostField); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void deviceToHost() +{ + Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + cuda::fieldCpy(deviceField, hostField); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + cuda::fieldCpy(hostField, deviceField); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToHost() +{ + Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); + auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + MPI_Request request1; + MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Request request2; + MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiHostToDevice() +{ + Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); + auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + MPI_Request request1; + MPI_Isend( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Request request2; + MPI_Irecv( deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToHost() +{ + Field<double, 1> hostField(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype = mpi::Datatype ( field::communication::mpiDatatype( hostField ) ); + auto deviceDatatype = mpi::Datatype ( field::communication::mpiDatatype( deviceField ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + MPI_Request request1; + MPI_Isend( deviceField.data(), 1, deviceDatatype, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Request request2; + MPI_Irecv( hostField.data(), 1, hostDatatype, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiDeviceToDevice() +{ + cuda::GPUField<double> deviceField1(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + cuda::GPUField<double> deviceField2(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto deviceDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( deviceField1 ) ); + auto deviceDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( deviceField2 ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + MPI_Request request1; + MPI_Isend( deviceField1.data(), 1, deviceDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Request request2; + MPI_Irecv( deviceField2.data(), 1, deviceDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyHostToDevice() +{ + Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); + auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + MPI_Request request1; + MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Request request2; + MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + + cuda::fieldCpy(deviceField, hostField2); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +void mpiCopyDeviceToHost() +{ + Field<double, 1> hostField1(SIZE_X, SIZE_Y, SIZE_Z, 4321.0, LAYOUT); + Field<double, 1> hostField2(SIZE_X, SIZE_Y, SIZE_Z, 0.0, LAYOUT); + cuda::GPUField<double> deviceField(SIZE_X, SIZE_Y, SIZE_Z, 1, 0, LAYOUT); + + auto hostDatatype1 = mpi::Datatype ( field::communication::mpiDatatype( hostField1 ) ); + auto hostDatatype2 = mpi::Datatype ( field::communication::mpiDatatype( hostField2 ) ); + + double startTime = MPI_Wtime(); + for (int i = 0; i < NUM_ITER; ++i) { + + MPI_Request request2; + MPI_Irecv( hostField2.data(), 1, hostDatatype2, 0, 0, MPI_COMM_WORLD, &request2 ); + + cuda::fieldCpy(hostField1, deviceField); + + MPI_Request request1; + MPI_Isend( hostField1.data(), 1, hostDatatype1, 0, 0, MPI_COMM_WORLD, &request1 ); + + MPI_Wait( &request1, MPI_STATUS_IGNORE ); + MPI_Wait( &request2, MPI_STATUS_IGNORE ); + } + double endTime = MPI_Wtime(); + std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; +} + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + walberla::Environment walberlaEnv( argc, argv ); + + hostToHost(); + hostToDevice(); + deviceToHost(); + mpiHostToHost(); + mpiHostToDevice(); + mpiDeviceToHost(); + mpiDeviceToDevice(); + mpiCopyHostToDevice(); + mpiCopyDeviceToHost(); + + return 0; +} diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b3f4af3e96a7f2e62ccbbbd9e1ba3ada881f08c3 --- /dev/null +++ b/tests/cuda/communication/GPUPackInfoTest.cpp @@ -0,0 +1,192 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup cuda +//! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \brief Tests if a GPUField is correctly packed into buffers +// +//====================================================================================================================== + +#include "field/GhostLayerField.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" +#include "cuda/communication/GPUPackInfo.h" + +#include "blockforest/Initialization.h" + +#include "core/debug/TestSubsystem.h" +#include "core/mpi/MPIManager.h" + +#include "stencil/D3Q27.h" + +#include <cstring> + +//#define DBG_PRINT_ON +#include "../CudaTestCommon.h" + +#define F_SIZE 19 + + +using namespace walberla; + + +cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new cuda::GPUField<int> ( + storage->getNumberOfXCells( *block ), // number of cells in x direction + storage->getNumberOfYCells( *block ), // number of cells in y direction + storage->getNumberOfZCells( *block ), // number of cells in z direction + F_SIZE, // fSize + 1, // number of ghost layers + field::fzyx ); +} + +// Tester base class. The communicate() template method allows testing different communication methods. +class GPUPackInfoTester +{ +public: + + typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType; + + GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {} + virtual ~GPUPackInfoTester() {} + + void test( stencil::Direction dir ) + { + cuda::GPUField<int> & gpuField = *(block_->getData<cuda::GPUField<int> >( fieldId_ )); + + field::GhostLayerField<int,F_SIZE> cpuField( + gpuField.xSize(), // number of cells in x direction + gpuField.ySize(), // number of cells in y direction + gpuField.zSize(), // number of cells in z direction + 1, // number of ghost layers + 0, // initial value + field::fzyx); + cpuField.setWithGhostLayer( 0 ); + + int val = 0; + for ( auto it = cpuField.beginSliceBeforeGhostLayer( dir ); it != cpuField.end(); ++it ) + { + *it = ++val; + } + cuda::fieldCpy( gpuField, cpuField ); + + GPUPackInfoType gpuPackInfo( fieldId_ ); + + communicate( gpuPackInfo, dir ); + cuda::fieldCpy( cpuField, gpuField ); + + DBG_PRINT_FIELD( cpuField ); + + val = 0; + for ( auto it = cpuField.beginGhostLayerOnly( stencil::inverseDir[dir] ); it != cpuField.end(); ++it ) + { + WALBERLA_CHECK_EQUAL( *it, ++val ); + } + + } + +protected: + + virtual void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) = 0; + + IBlock* block_; + BlockDataID fieldId_; +}; + + +// Tester for buffer communication +class GPUPackInfoBufferTester: public GPUPackInfoTester +{ +public: + GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} + +protected: + void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) + { + DBG_PRINT( "REMOTE %s -> %s\n", + stencil::dirToString[dir].c_str(), + stencil::dirToString[stencil::inverseDir[dir]].c_str() ); + + mpi::GenericSendBuffer<> sendBuf; + sendBuf.addDebugMarker( "Be" ); + gpuPackInfo.packData( block_, dir, sendBuf ); + sendBuf.addDebugMarker( "Af" ); + + // Manually copy over the send to the receive buffer + mpi::GenericRecvBuffer<> recvBuf; + recvBuf.resize( sendBuf.size() ); + memcpy( recvBuf.ptr(), sendBuf.ptr(), sendBuf.size() * sizeof(mpi::GenericSendBuffer<>::ElementType) ); + + recvBuf.readDebugMarker( "Be" ); + gpuPackInfo.unpackData( block_, stencil::inverseDir[dir], recvBuf ); + recvBuf.readDebugMarker( "Af" ); + } +}; + + +// Tester for local communication +class GPUPackInfoLocalTester: public GPUPackInfoTester +{ +public: + GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} + +protected: + void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) + { + DBG_PRINT( "LOCAL %s -> %s\n", + stencil::dirToString[dir].c_str(), + stencil::dirToString[stencil::inverseDir[dir]].c_str() ); + + gpuPackInfo.communicateLocal( block_, block_, dir ); + } +}; + + +int main(int argc, char **argv) +{ + using blockforest::createUniformBlockGrid; + + debug::enterTestMode(); + MPIManager::instance()->initializeMPI(&argc,&argv); + + // Create BlockForest + uint_t processes = uint_c( MPIManager::instance()->numProcesses() ); + auto blocks = createUniformBlockGrid(processes,1,1, //blocks + 2,2,2, //cells + 1, //dx + false, //one block per process + true,true,true);//periodicity + + + BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >( + &createGPUField, "ScalarGPUField" ); + + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId ); + GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId ); + + for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir ) + { + localTester.test( *dir ); + bufferTester.test( *dir ); + } + } + + return 0; +}