diff --git a/src/cuda/CudaRAII.h b/src/cuda/CudaRAII.h new file mode 100644 index 0000000000000000000000000000000000000000..f179348f7860b65f31deeb107e0dfeb979883402 --- /dev/null +++ b/src/cuda/CudaRAII.h @@ -0,0 +1,85 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CudaRAII.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "ErrorChecking.h" + + +namespace walberla { +namespace cuda { + + +class StreamRAII +{ +public: + ~StreamRAII() { + if( stream_ != 0 ) { + WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream_ )); + } + } + + StreamRAII( StreamRAII && other) { + stream_ = other.stream_; + other.stream_ = 0; + } + + StreamRAII(const StreamRAII&) = delete; + void operator=( const StreamRAII &) = delete; + operator cudaStream_t() const { return stream_; } + + + static StreamRAII defaultStream() { + StreamRAII result; + result.stream_ = 0; + return result; + } + + static StreamRAII newPriorityStream(int priority) { + StreamRAII result; + WALBERLA_CUDA_CHECK( cudaStreamCreateWithPriority( &result.stream_, cudaStreamDefault, priority )); + return result; + } + + static StreamRAII newStream() { + StreamRAII result; + WALBERLA_CUDA_CHECK( cudaStreamCreate( &result.stream_)); + return result; + } + +private: + StreamRAII() {} + cudaStream_t stream_; +}; + + +struct EventRAII +{ + explicit EventRAII() { WALBERLA_CUDA_CHECK( cudaEventCreate(&event) ); } + ~EventRAII() { WALBERLA_CUDA_CHECK( cudaEventDestroy(event) ); } + EventRAII(const EventRAII &) = delete; + void operator=( const EventRAII &) = delete; + operator cudaEvent_t() const { return event; } + + cudaEvent_t event; +}; + + +} // namespace cuda +} // namespace walberla \ No newline at end of file diff --git a/src/cuda/ErrorChecking.h b/src/cuda/ErrorChecking.h index 49b2167445db23614e744020c76283d0fc0a72d4..82dc0b4a913936eb2c4b8f9d01ccc0c1e0df2528 100644 --- a/src/cuda/ErrorChecking.h +++ b/src/cuda/ErrorChecking.h @@ -40,7 +40,7 @@ inline void checkForError( cudaError_t code, const std::string & callerPath, con if(code != cudaSuccess) { std::stringstream ss; - ss << "CUDA Error: " << cudaGetErrorString( code ); + ss << "CUDA Error: " << code << " " << 
cudaGetErrorName(code) << ": " << cudaGetErrorString( code ); Abort::instance()->abort( ss.str(), callerPath, line ); } } diff --git a/src/cuda/GPUField.h b/src/cuda/GPUField.h index 116071a6505a11be9e186c0172ad6f00c1259251..bc24470726cadb69e4d30894e369e26dacb5ceae 100755 --- a/src/cuda/GPUField.h +++ b/src/cuda/GPUField.h @@ -118,6 +118,9 @@ namespace cuda { inline uint_t nrOfGhostLayers() const { return nrOfGhostLayers_; } + inline CellInterval xyzSize() const; + inline CellInterval xyzSizeWithGhostLayer() const; + bool operator==( const GPUField & other ) const; void getGhostRegion( stencil::Direction d, CellInterval & ci, diff --git a/src/cuda/GPUField.impl.h b/src/cuda/GPUField.impl.h index 946afd7ae7fbc016b6fc43123539f9bef960ad62..840ebd6a59cb98e3ae57e75bdae4513b61039766 100644 --- a/src/cuda/GPUField.impl.h +++ b/src/cuda/GPUField.impl.h @@ -112,6 +112,26 @@ void GPUField<T>::getGhostRegion(stencil::Direction d, CellInterval & ci, } +template<typename T> +inline CellInterval GPUField<T>::xyzSize() const +{ + return CellInterval (0,0,0, + cell_idx_c( xSize() )-1, + cell_idx_c( ySize() )-1, + cell_idx_c( zSize() )-1 ); +} + +template<typename T> +inline CellInterval GPUField<T>::xyzSizeWithGhostLayer() const +{ + CellInterval ci = GPUField<T>::xyzSize(); + for( uint_t i=0; i < 3; ++i ) { + ci.min()[i] -= cell_idx_c( nrOfGhostLayers_ ); + ci.max()[i] += cell_idx_c( nrOfGhostLayers_ ); + } + return ci; +} + template<typename T> void GPUField<T>::getSlice(stencil::Direction d, CellInterval & ci, cell_idx_t distance, cell_idx_t thickness, bool fullSlice ) const diff --git a/src/cuda/communication/CustomMemoryBuffer.h b/src/cuda/communication/CustomMemoryBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad176077a09d292ab7f00750d58efb0348a37cb --- /dev/null +++ b/src/cuda/communication/CustomMemoryBuffer.h @@ -0,0 +1,143 @@ 
+//======================================================================================================================
+//
+// This file is part of waLBerla. waLBerla is free software: you can
+// redistribute it and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// waLBerla is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CustomMemoryBuffer.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//! \brief Basic Buffer supporting different memory spaces
+//
+//======================================================================================================================
+
+#pragma once
+
+#include "cuda/ErrorChecking.h"
+
+#include <algorithm>
+#include <cstring>
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+   struct HostMemoryAllocator;
+   struct DeviceMemoryAllocator;
+
+   //*******************************************************************************************************************
+   /*!
+    * Simple buffer class that supports memory allocators, e.g. for pinned host memory or GPU memory
+    *
+    * \ingroup cuda
+    *
+    * In contrast to core::mpi::Buffer this class does not support stream operators "<<" and ">>" because these
+    * operators imply serial (un)packing which is not feasible on the GPU
+ * The allocator template has to provide:
+ *    - static void *allocate( size_t size )
+ *    - void deallocate( void *ptr )
+ *    - void memcpy( void *dst, void *src, size_t count )
+ *
+ * The buffer has a beginning, a current position and an end position. Here is an overview of the most important
+ * operations:
+ *   - clear: reset current position to begin, does not change size
+ *   - advance: moves current position number of bytes forward and returns pointer to the old current position
+ *              two versions are available, one that automatically resizes and reallocates the buffer, and one that
+ *              fails if not enough space is available
+ */
+   //*******************************************************************************************************************
+   template<typename Allocator>
+   class CustomMemoryBuffer
+   {
+   public:
+      typedef uint8_t ElementType;
+
+      explicit CustomMemoryBuffer();
+      explicit CustomMemoryBuffer( std::size_t initSize );
+      explicit CustomMemoryBuffer( const CustomMemoryBuffer &pb );
+      ~CustomMemoryBuffer();
+      CustomMemoryBuffer &operator=( const CustomMemoryBuffer &pb );
+
+      void resize( std::size_t newSize );
+      inline std::size_t allocSize() const { return std::size_t(end_ - begin_); }
+      inline std::size_t size() const { return std::size_t(cur_ - begin_); }
+      ElementType *ptr() const { return begin_; }
+
+      inline void clear() { cur_ = begin_; }
+
+      ElementType *advance( std::size_t bytes );
+      ElementType *advanceNoResize( std::size_t bytes );
+
+      template<typename T>
+      T *advance( std::size_t bytes ) { return reinterpret_cast<T *>( advance( bytes * sizeof( T ))); }
+      template<typename T>
+      T *advanceNoResize( std::size_t bytes ) { return reinterpret_cast<T *>( advanceNoResize( bytes * sizeof( T ))); }
+
+   private:
+      ElementType *begin_;
+      ElementType *cur_;
+      ElementType *end_;
+   };
+
+
+   using PinnedMemoryBuffer = CustomMemoryBuffer<HostMemoryAllocator>;
+   using GPUMemoryBuffer = CustomMemoryBuffer<DeviceMemoryAllocator>;
+
+
+   struct 
HostMemoryAllocator + { + static void *allocate( size_t size ) + { + void *p; + WALBERLA_CUDA_CHECK( cudaMallocHost( &p, size )); + return p; + } + + static void deallocate( void *ptr ) + { + WALBERLA_CUDA_CHECK( cudaFreeHost( ptr )); + } + + static void memcpy( void *dst, void *src, size_t count ) + { + std::memcpy( dst, src, count ); + } + }; + + struct DeviceMemoryAllocator + { + static void *allocate( size_t size ) + { + void *p; + WALBERLA_CUDA_CHECK( cudaMalloc( &p, size )); + return p; + } + + static void deallocate( void *ptr ) + { + WALBERLA_CUDA_CHECK( cudaFree( ptr )); + } + + static void memcpy( void *dst, void *src, size_t count ) + { + cudaMemcpy( dst, src, count, cudaMemcpyDeviceToDevice ); + } + }; + + +} // namespace communication +} // namespace cuda +} // namespace walberla + +#include "CustomMemoryBuffer.impl.h" diff --git a/src/cuda/communication/CustomMemoryBuffer.impl.h b/src/cuda/communication/CustomMemoryBuffer.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..377ba4bd3dd6dc1b07100cdcac86dccd4fc5265d --- /dev/null +++ b/src/cuda/communication/CustomMemoryBuffer.impl.h @@ -0,0 +1,120 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\file CustomMemoryBuffer.impl.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+namespace walberla {
+namespace cuda {
+namespace communication {
+
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer()
+           : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
+   {
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( std::size_t initSize )
+           : begin_( nullptr ), cur_( nullptr ), end_( nullptr )
+   {
+      if( initSize > 0 )
+      {
+         begin_ = Allocator::allocate( initSize );
+         end_ = begin_ + initSize;
+         cur_ = begin_;
+      }
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::CustomMemoryBuffer( const CustomMemoryBuffer &pb )
+   {
+      if( pb.begin_ != nullptr )
+      {
+         begin_ = reinterpret_cast<ElementType *>(Allocator::allocate( pb.allocSize()));
+         end_ = begin_ + pb.allocSize();
+         Allocator::memcpy( begin_, pb.begin_, pb.allocSize());
+         cur_ = begin_ + pb.size();
+      }
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator> &CustomMemoryBuffer<Allocator>::operator=( const CustomMemoryBuffer<Allocator> &pb )
+   {
+      auto copy( pb );
+      std::swap( cur_, copy.cur_ );
+      std::swap( begin_, copy.begin_ );
+      std::swap( end_, copy.end_ );
+      return *this;
+   }
+
+   template<typename Allocator>
+   CustomMemoryBuffer<Allocator>::~CustomMemoryBuffer()
+   {
+      if( begin_ != nullptr )
+         Allocator::deallocate( begin_ );
+   }
+
+   template<typename Allocator>
+   void CustomMemoryBuffer<Allocator>::resize( std::size_t newSize )
+   {
+      if( newSize > allocSize())
+      {
+         auto offset = cur_ - begin_;
+
+         ElementType *newBegin;
+
+         newBegin = reinterpret_cast<ElementType *>(Allocator::allocate( newSize ));
+
+         Allocator::memcpy( newBegin, begin_, size_t(end_ - begin_) );
+
+         std::swap( begin_, newBegin );
+         if( newBegin != nullptr )
+            Allocator::deallocate( newBegin );
+
+         end_ = 
begin_ + newSize; + cur_ = begin_ + offset; + } + + } + + template<typename Allocator> + typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advance( std::size_t bytes ) + { + resize( size() + bytes ); + auto result = cur_; + cur_ += bytes; + WALBERLA_ASSERT_LESS_EQUAL( cur_, end_ ); + return result; + } + + template<typename Allocator> + typename CustomMemoryBuffer<Allocator>::ElementType *CustomMemoryBuffer<Allocator>::advanceNoResize( std::size_t bytes ) + { + auto newSize = size() + bytes; + if( newSize <= allocSize()) + return advance( bytes ); + else + return nullptr; + } + +} // namespace communication +} // namespace cuda +} // namespace walberla diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h index 61c698d82e729add8ffe639cfb31184d7d2bf5f4..7e1bad001bbaa9e912c972eef0004373262135cb 100644 --- a/src/cuda/communication/GPUPackInfo.h +++ b/src/cuda/communication/GPUPackInfo.h @@ -21,21 +21,18 @@ #pragma once +#include "blockforest/Block.h" +#include "communication/UniformPackInfo.h" #include "core/debug/Debug.h" #include "core/math/Vector3.h" #include "core/mpi/BufferSizeTrait.h" - -#include "stencil/Directions.h" - #include "field/GhostRegions.h" - -#include "communication/UniformPackInfo.h" - -#include "blockforest/Block.h" +#include "field/Layout.h" +#include "stencil/Directions.h" #include "cuda/ErrorChecking.h" #include "cuda/GPUCopy.h" -#include "cuda/communication/PinnedMemoryBuffer.h" +#include "cuda/communication/CustomMemoryBuffer.h" #include <cuda_runtime.h> #include <map> @@ -142,7 +139,8 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d if ( copyAsync_ ) { PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir]; - copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead ); + pinnedBuffer.clear(); + copyBufferPtr = pinnedBuffer.advance( nrOfBytesToRead ); // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU 
std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr ); } @@ -158,7 +156,7 @@ void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction d auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), fieldPtr->fSize() ); - if ( fieldPtr->layout() == fzyx ) + if ( fieldPtr->layout() == field::fzyx ) { const uint_t dstAllocSizeZ = fieldPtr->zAllocSize(); const uint_t srcAllocSizeZ = fieldCi.zSize(); @@ -217,7 +215,7 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() ); - if ( sf->layout() == fzyx ) + if ( sf->layout() == field::fzyx ) { const uint_t dstAllocSizeZ = rf->zAllocSize(); const uint_t srcAllocSizeZ = sf->zAllocSize(); @@ -263,7 +261,8 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc if ( copyAsync_ ) { PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir]; - copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack ); + pinnedBuffer.clear(); + copyBufferPtr = pinnedBuffer.advance( nrOfBytesToPack ); } auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); @@ -275,7 +274,7 @@ void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direc auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), fieldPtr->fSize() ); - if ( fieldPtr->layout() == fzyx ) + if ( fieldPtr->layout() == field::fzyx ) { const uint_t dstAllocSizeZ = fieldCi.zSize(); const uint_t srcAllocSizeZ = fieldPtr->zAllocSize(); diff --git a/src/cuda/communication/GeneratedGPUPackInfo.h b/src/cuda/communication/GeneratedGPUPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..752f2907c734cbb1a18d73b390424d33c94f3aa8 --- /dev/null +++ b/src/cuda/communication/GeneratedGPUPackInfo.h @@ -0,0 +1,44 @@ 
+//======================================================================================================================
+//
+// This file is part of waLBerla. waLBerla is free software: you can
+// redistribute it and/or modify it under the terms of the GNU General Public
+// License as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// waLBerla is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file GeneratedGPUPackInfo.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#pragma once
+#include "stencil/Directions.h"
+#include "domain_decomposition/IBlock.h"
+#include <cuda_runtime.h>
+
+
+namespace walberla {
+namespace cuda {
+
+
+class GeneratedGPUPackInfo
+{
+public:
+   virtual void pack ( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
+   virtual void unpack( stencil::Direction dir, unsigned char *buffer, IBlock *block, cudaStream_t stream ) = 0;
+   virtual uint_t size( stencil::Direction dir, IBlock *block ) = 0;
+};
+
+
+
+} //namespace cuda
+} //namespace walberla
\ No newline at end of file
diff --git a/src/cuda/communication/PinnedMemoryBuffer.h b/src/cuda/communication/PinnedMemoryBuffer.h
deleted file mode 100644
index 702e71bd69aa9bfa7858e3595330b092edfe65dc..0000000000000000000000000000000000000000
--- a/src/cuda/communication/PinnedMemoryBuffer.h
+++ /dev/null
@@ -1,123 +0,0 @@ 
-//====================================================================================================================== -// -// This file is part of waLBerla. waLBerla is free software: you can -// redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of -// the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -// for more details. -// -// You should have received a copy of the GNU General Public License along -// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. -// -//! \file PinnedMemoryBuffer.h -//! \ingroup cuda -//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> -//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies. 
-// -//====================================================================================================================== - -#pragma once - -#include "cuda/ErrorChecking.h" - -#include <algorithm> -#include <cuda_runtime.h> - - -namespace walberla { -namespace cuda { -namespace communication { - -template< typename T = unsigned char > -class GenericPinnedMemoryBuffer -{ -public: - typedef T ElementType; - - GenericPinnedMemoryBuffer(); - - GenericPinnedMemoryBuffer( std::size_t initSize ); - - GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb ); - - ~GenericPinnedMemoryBuffer(); - - inline T* ptr() const { return data_; } - - inline T* resize( std::size_t newSize ); - - inline std::size_t size() const { return size_; } - - GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete; -private: - T * data_; - std::size_t size_; -}; - -typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer; - -template< typename T > // Element type -GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer() - : data_(nullptr), size_(0) -{ -} - - -template< typename T > // Element type -GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize ) - : data_(nullptr), size_(initSize) -{ - if (initSize > 0) - { - WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) ); - } -} - -template< typename T > // Element type -GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb ) - : size_(pb.size_) -{ - if ( pb.size_ > 0 ) - { - WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) ); - - std::copy( pb.data_, static_cast<T *>(pb.data_ + pb.size_), data_ ); - } -} - -template< typename T > // Element type -GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer() -{ - if ( data_ != nullptr ) - { - WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) ); - } -} - -template< typename T > // Element type -T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize) -{ - if ( 
newSize > size_ ) - { - T * newBegin; - WALBERLA_CUDA_CHECK( cudaMallocHost( &newBegin, newSize * sizeof(T) ) ); - - std::swap( data_, newBegin ); - if ( newBegin != nullptr ) - { - WALBERLA_CUDA_CHECK( cudaFreeHost( newBegin ) ); - } - - size_ = newSize; - } - - return data_; -} - -} // namespace communication -} // namespace cuda -} // namespace walberla diff --git a/src/cuda/communication/UniformGPUScheme.h b/src/cuda/communication/UniformGPUScheme.h new file mode 100644 index 0000000000000000000000000000000000000000..b7e0bf06cdc177f7cae675d0239a58d8ba968a65 --- /dev/null +++ b/src/cuda/communication/UniformGPUScheme.h @@ -0,0 +1,94 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGPUScheme.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#pragma once + +#include "blockforest/StructuredBlockForest.h" +#include "core/mpi/MPIWrapper.h" +#include "core/mpi/BufferSystem.h" +#include "core/WeakPtrWrapper.h" +#include "domain_decomposition/IBlock.h" +#include "stencil/Directions.h" + +#include "cuda/CudaRAII.h" +#include "cuda/communication/GeneratedGPUPackInfo.h" +#include "cuda/communication/CustomMemoryBuffer.h" + +#include <chrono> +#include <thread> + +namespace walberla { +namespace cuda { +namespace communication { + + + + template<typename Stencil> + class UniformGPUScheme + { + public: + explicit UniformGPUScheme( weak_ptr_wrapper<StructuredBlockForest> bf, + const shared_ptr<cuda::EventRAII> & startWaitEvent, + bool sendDirectlyFromGPU = false, + const int tag = 5432 ); + + void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); + + void startCommunication(); + void wait(); + + void operator()() { communicate(); } + inline void communicate() { startCommunication(); wait(); } + + private: + void setupCommunication(); + + weak_ptr_wrapper<StructuredBlockForest> blockForest_; + shared_ptr<cuda::EventRAII> startWaitEvent_; + uint_t forestModificationStamp_; + + bool setupBeforeNextCommunication_; + bool communicationInProgress_; + bool sendFromGPU_; + + using CpuBuffer_T = cuda::communication::PinnedMemoryBuffer; + using GpuBuffer_T = cuda::communication::GPUMemoryBuffer; + + mpi::GenericBufferSystem<CpuBuffer_T, CpuBuffer_T> bufferSystemCPU_; + mpi::GenericBufferSystem<GpuBuffer_T, GpuBuffer_T> bufferSystemGPU_; + + std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_; + std::map<stencil::Direction, cuda::StreamRAII> streams_; + + struct Header + { + BlockID blockId; + stencil::Direction dir; + }; + std::map<mpi::MPIRank, std::vector<Header> > headers_; + }; + + +} // namespace communication +} // namespace cuda +} 
// namespace walberla + +#include "UniformGPUScheme.impl.h" diff --git a/src/cuda/communication/UniformGPUScheme.impl.h b/src/cuda/communication/UniformGPUScheme.impl.h new file mode 100644 index 0000000000000000000000000000000000000000..2b9ebd32a53b1681a84d131e54c54070bc889d58 --- /dev/null +++ b/src/cuda/communication/UniformGPUScheme.impl.h @@ -0,0 +1,248 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGPUScheme.impl.h +//! \ingroup cuda +//! 
\author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +namespace walberla { +namespace cuda { +namespace communication { + + +template<typename Stencil> +UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr_wrapper <StructuredBlockForest> bf, + const shared_ptr <cuda::EventRAII> &startWaitEvent, + bool sendDirectlyFromGPU, + const int tag ) + : blockForest_( bf ), + startWaitEvent_( startWaitEvent ), + setupBeforeNextCommunication_( true ), + communicationInProgress_( false ), + sendFromGPU_( sendDirectlyFromGPU ), + bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), + bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ) {} + + + template<typename Stencil> + void UniformGPUScheme<Stencil>::startCommunication() + { + WALBERLA_ASSERT( !communicationInProgress_ ); + auto forest = blockForest_.lock(); + + if( setupBeforeNextCommunication_ || + forest->getBlockForest().getModificationStamp() != forestModificationStamp_ ) + setupCommunication(); + + // Schedule Receives + if( sendFromGPU_ ) + bufferSystemGPU_.scheduleReceives(); + else + bufferSystemCPU_.scheduleReceives(); + + + if( !sendFromGPU_ ) + for( auto it : headers_ ) + bufferSystemGPU_.sendBuffer( it.first ).clear(); + + // Start filling send buffers + for( auto &iBlock : *forest ) + { + auto block = dynamic_cast< Block * >( &iBlock ); + for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) + { + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); + if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) + continue; + auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); + + if( streams_.find( *dir ) == streams_.end() ) + { + streams_.emplace( *dir, StreamRAII::newPriorityStream( -1 )); + } + + auto &ci = streams_.at( *dir ); + + for( auto &pi : packInfos_ ) + { + auto size = pi->size( 
*dir, block ); + auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); + WALBERLA_CUDA_CHECK( cudaStreamWaitEvent( ci, *startWaitEvent_, 0 )); + pi->pack( *dir, gpuDataPtr, block, ci ); + + if( !sendFromGPU_ ) + { + auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ); + WALBERLA_CUDA_CHECK( + cudaMemcpyAsync( cpuDataPtr, gpuDataPtr, size, cudaMemcpyDeviceToHost, ci )); + } + } + } + } + + // Busy waiting for packing to finish - then send + for( auto &ci : streams_ ) WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second )); + + if( sendFromGPU_ ) + bufferSystemGPU_.sendAll(); + else + bufferSystemCPU_.sendAll(); + + communicationInProgress_ = true; + } + + + template<typename Stencil> + void UniformGPUScheme<Stencil>::wait() + { + WALBERLA_ASSERT( communicationInProgress_ ); + + auto forest = blockForest_.lock(); + + if( sendFromGPU_ ) + { + for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo ) + { + for( auto &header : headers_[recvInfo.rank()] ) + { + auto &ci = streams_.at( header.dir ); + auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId )); + + for( auto &pi : packInfos_ ) + { + auto size = pi->size( header.dir, block ); + auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci ); + } + } + } + } + else + { + for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo ) + { + using namespace std::chrono_literals; + std::this_thread::sleep_for( 1ms ); + auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank()); + + gpuBuffer.clear(); + for( auto &header : headers_[recvInfo.rank()] ) { + auto &ci = streams_.at( header.dir ); + auto block = dynamic_cast< Block * >( forest->getBlock( header.blockId 
)); + + for( auto &pi : packInfos_ ) + { + auto size = pi->size( header.dir, block ); + auto cpuDataPtr = recvInfo.buffer().advanceNoResize( size ); + auto gpuDataPtr = gpuBuffer.advanceNoResize( size ); + WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ); + WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ); + + WALBERLA_CUDA_CHECK( cudaMemcpyAsync( gpuDataPtr, cpuDataPtr, size, + cudaMemcpyHostToDevice, ci )); + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, ci ); + } + } + } + } + + for( auto &ci : streams_ ) + WALBERLA_CUDA_CHECK( cudaStreamSynchronize( ci.second )); + + communicationInProgress_ = false; + } + + + template<typename Stencil> + void UniformGPUScheme<Stencil>::setupCommunication() + { + auto forest = blockForest_.lock(); + + headers_.clear(); + + std::map<mpi::MPIRank, mpi::MPISize> receiverInfo; // how many bytes to send to each neighbor + + mpi::BufferSystem headerExchangeBs( mpi::MPIManager::instance()->comm(), 123 ); + + for( auto &iBlock : *forest ) { + auto block = dynamic_cast< Block * >( &iBlock ); + + for( auto dir = Stencil::beginNoCenter(); dir != Stencil::end(); ++dir ) { + // skip if block has no neighbors in this direction + const auto neighborIdx = blockforest::getBlockNeighborhoodSectionIndex( *dir ); + if( block->getNeighborhoodSectionSize( neighborIdx ) == uint_t( 0 )) + continue; + + WALBERLA_ASSERT( block->neighborhoodSectionHasEquallySizedBlock( neighborIdx ), + "Works for uniform setups only" ); + WALBERLA_ASSERT_EQUAL( block->getNeighborhoodSectionSize( neighborIdx ), uint_t( 1 ), + "Works for uniform setups only" ); + + const BlockID &nBlockId = block->getNeighborId( neighborIdx, uint_t( 0 )); + auto nProcess = mpi::MPIRank( block->getNeighborProcess( neighborIdx, uint_t( 0 ))); + + for( auto &pi : packInfos_ ) + receiverInfo[nProcess] += mpi::MPISize( pi->size( *dir, block )); + + auto &headerBuffer = headerExchangeBs.sendBuffer( nProcess ); + nBlockId.toBuffer( headerBuffer ); + headerBuffer << *dir; + } + } + + 
headerExchangeBs.setReceiverInfoFromSendBufferState( false, true ); + headerExchangeBs.sendAll(); + for( auto recvIter = headerExchangeBs.begin(); recvIter != headerExchangeBs.end(); ++recvIter ) { + auto &headerVector = headers_[recvIter.rank()]; + auto &buffer = recvIter.buffer(); + while ( buffer.size()) { + Header header; + header.blockId.fromBuffer( buffer ); + buffer >> header.dir; + headerVector.push_back( header ); + } + } + + bufferSystemCPU_.setReceiverInfo( receiverInfo ); + bufferSystemGPU_.setReceiverInfo( receiverInfo ); + + for( auto it : receiverInfo ) { + bufferSystemCPU_.sendBuffer( it.first ).resize( it.second ); + bufferSystemGPU_.sendBuffer( it.first ).resize( it.second ); + } + + forestModificationStamp_ = forest->getBlockForest().getModificationStamp(); + setupBeforeNextCommunication_ = false; + } + + + template<typename Stencil> + void UniformGPUScheme<Stencil>::addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ) + { + WALBERLA_ASSERT( !communicationInProgress_, "Cannot add pack info while communication is in progress" ); + packInfos_.push_back( pi ); + setupBeforeNextCommunication_ = true; + } + + +} // namespace communication +} // namespace cuda +} // namespace walberla diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index aa82a445f73db26c8cb7312ab0445a8eefe6b5bc..39222ebdeca1f35fd58aabd94ca13c2795b9e3d7 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -31,3 +31,7 @@ waLBerla_compile_test( FILES communication/CommTest ) waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui ) #waLBerla_execute_test( NAME CudaMPI ) + +waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest + FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py + DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui ) diff --git a/tests/cuda/codegen/CudaJacobiKernel.py b/tests/cuda/codegen/CudaJacobiKernel.py index 
d30da2edcb663a351634490c93d503742e34e418..225593dcdb12fd90d6c679e5da00035f5c2f9054 100644 --- a/tests/cuda/codegen/CudaJacobiKernel.py +++ b/tests/cuda/codegen/CudaJacobiKernel.py @@ -1,17 +1,20 @@ from pystencils_walberla.sweep import Sweep -from pystencils_walberla.cmake_integration import codegen + def jacobi2D(sweep): src = sweep.field("f1") dst = sweep.temporaryField(src) - dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * S.h ** 2) + dst[0, 0] @= (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]) / (4 * sweep.constant.h ** 2) + def jacobi3D(sweep): src = sweep.field("f1") dst = sweep.temporaryField(src) - dst[0,0,0] @= (src[1,0,0] + src[-1,0,0] + src[0,1,0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0 , -1] ) / (6 * S.h**2) + dst[0, 0, 0] @= (src[1, 0, 0] + src[-1, 0, 0] + src[0, 1, 0] + src[0, -1, 0] + src[0, 0, 1] + src[0, 0, -1]) / \ + (6 * sweep.constant.h ** 2) + Sweep.generate('CudaJacobiKernel2D', jacobi2D, dim=2, target='gpu') -Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu') \ No newline at end of file +Sweep.generate('CudaJacobiKernel3D', jacobi3D, dim=3, target='gpu') diff --git a/tests/cuda/codegen/EquivalenceTest.cpp b/tests/cuda/codegen/EquivalenceTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bbdc079d7e84de2538653a5a63f6aad596c35c4f --- /dev/null +++ b/tests/cuda/codegen/EquivalenceTest.cpp @@ -0,0 +1,178 @@ +#include "core/Environment.h" +#include "python_coupling/CreateConfig.h" +#include "blockforest/Initialization.h" +#include "lbm/field/PdfField.h" +#include "lbm/field/AddToStorage.h" +#include "field/FlagField.h" +#include "field/AddToStorage.h" +#include "lbm/communication/PdfFieldPackInfo.h" +#include "lbm/vtk/VTKOutput.h" +#include "lbm/PerformanceLogger.h" +#include "blockforest/communication/UniformBufferedScheme.h" +#include "timeloop/all.h" +#include "core/math/Random.h" +#include "geometry/all.h" +#include "cuda/HostFieldAllocator.h" +#include 
"cuda/communication/GPUPackInfo.h" +#include "core/timing/TimingPool.h" +#include "core/timing/RemainingTimeLogger.h" +#include "cuda/AddGPUFieldToStorage.h" +#include "cuda/communication/UniformGPUScheme.h" +#include "lbm/sweeps/CellwiseSweep.h" +#include "domain_decomposition/SharedSweep.h" + +#include "EquivalenceTest_LatticeModel.h" +#include "EquivalenceTest_GPUKernel.h" +#include "EquivalenceTest_GPUPackInfo.h" + +using namespace walberla; + +using NativeLatticeModel_T = lbm::D3Q19<lbm::collision_model::SRT, false>; +using GeneratedLatticeModel_T = lbm::EquivalenceTest_LatticeModel; + +using Stencil_T = GeneratedLatticeModel_T::Stencil; +using CommunicationStencil_T = GeneratedLatticeModel_T::CommunicationStencil; +using NativePdfField_T = lbm::PdfField<NativeLatticeModel_T>; +using GeneratedPdfField_T = lbm::PdfField<GeneratedLatticeModel_T>; + +using flag_t = walberla::uint8_t; +using FlagField_T = FlagField<flag_t>; + +using CpuCommScheme_T = blockforest::communication::UniformBufferedScheme<CommunicationStencil_T>; +using GpuCommScheme_T = cuda::communication::UniformGPUScheme<CommunicationStencil_T>; + + +template<typename PdfField_T> +void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID pdfFieldId ) +{ + auto domainBB = blocks->getDomainCellBB(); + + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + auto pdfField = blockIt->getData<PdfField_T>( pdfFieldId ); + Cell offset( 0, 0, 0 ); + blocks->transformBlockLocalToGlobalCell( offset, *blockIt ); + + WALBERLA_FOR_ALL_CELLS_XYZ( pdfField, + auto globalX = real_c( offset[0] + x ); + auto globalZ = real_c( offset[2] + z ); + auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) )); + auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) )); + pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( 0.05 * std::sin(xArg), 0, 0.05 * std::cos(zArg))); + ); + } +} + + +int main( int argc, char **argv 
) +{ + mpi::Environment env( argc, argv ); + + for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + { + auto config = *cfg; + auto parameters = config->getOneBlock( "Parameters" ); + + auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + + const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); + const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); + + // Boundary + BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); + const FlagUID fluidFlagUID( "Fluid" ); + geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID ); + + + // Part 1 : Native walberla + NativeLatticeModel_T nativeLatticeModel = NativeLatticeModel_T( lbm::collision_model::SRT( omega )); + BlockDataID pdfFieldNativeId = lbm::addPdfFieldToStorage( blocks, "pdfNative", nativeLatticeModel, field::fzyx ); + initPdfField<NativePdfField_T >( blocks, pdfFieldNativeId ); + CpuCommScheme_T nativeComm( blocks ); + nativeComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< NativeLatticeModel_T > >( pdfFieldNativeId ) ); + auto nativeSweep = lbm::makeCellwiseSweep< NativeLatticeModel_T , FlagField_T >( pdfFieldNativeId, flagFieldId, fluidFlagUID ); + + SweepTimeloop nativeTimeLoop( blocks->getBlockStorage(), timesteps ); + nativeTimeLoop.add() << BeforeFunction( nativeComm, "communication" ) + << Sweep(makeSharedSweep(nativeSweep), "native stream collide" ); + nativeTimeLoop.run(); + + + // Part 2: Generated CPU Version + GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega ); + BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx ); + initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId ); + CpuCommScheme_T cpuComm( blocks ); + cpuComm.addPackInfo( make_shared< lbm::PdfFieldPackInfo< GeneratedLatticeModel_T > 
>( pdfFieldGeneratedId ) ); + SweepTimeloop cpuTimeLoop( blocks->getBlockStorage(), timesteps ); + cpuTimeLoop.add() << BeforeFunction( cpuComm, "communication" ) + << Sweep(GeneratedLatticeModel_T::Sweep( pdfFieldGeneratedId ), "generated stream collide on cpu" ); + cpuTimeLoop.run(); + + + // Part 3: Generated GPU Version + bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true ); + bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); + + BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx ); + initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU ); + + BlockDataID pdfGpuFieldId = cuda::addGPUFieldToStorage<GeneratedPdfField_T >( blocks, pdfShadowCPU, "pdfs on gpu", true ); + auto defaultKernelStream = overlapCommunication ? cuda::StreamRAII::newStream() : cuda::StreamRAII::defaultStream(); + auto innerKernelStartedEvent = make_shared<cuda::EventRAII>(); + + pystencils::EquivalenceTest_GPUKernel cudaLbKernel( pdfGpuFieldId, omega, defaultKernelStream ); + GpuCommScheme_T gpuComm( blocks, innerKernelStartedEvent, cudaEnabledMPI ); + gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId )); + auto runCommunication = [&]() { gpuComm(); }; + + SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps ); + if( !overlapCommunication ) + { + gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" ) + << Sweep( cudaLbKernel, "LB stream & collide gpu" ); + } + else + { + gpuTimeLoop.add() << Sweep( [&]( IBlock *b ) + { + cudaEventRecord( *innerKernelStartedEvent, defaultKernelStream ); + cudaLbKernel.inner( b ); + }, "LBM @ inner" ); + gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" ) + << Sweep( [&]( IBlock *b ) { cudaLbKernel.outer( b ); }, "LBM @ outer" ); + } + gpuTimeLoop.run(); + cuda::fieldCpy<GeneratedPdfField_T, cuda::GPUField<real_t>> (blocks, 
pdfShadowCPU, pdfGpuFieldId); + + // Compare all three versions + auto errorCPU = real_t(0); + auto errorGPU = real_t(0); + + for( auto & block : *blocks ) + { + auto native = block.getData<NativePdfField_T>( pdfFieldNativeId ); + auto cpu = block.getData<GeneratedPdfField_T >( pdfFieldGeneratedId ); + auto gpu = block.getData<GeneratedPdfField_T>( pdfShadowCPU ); + + WALBERLA_FOR_ALL_CELLS_XYZ(native, + for(cell_idx_t f = 0; f < cell_idx_c(NativeLatticeModel_T::Stencil::Q); ++f ) + { + errorCPU += std::abs( native->get( x, y, z, f ) - cpu->get( x, y, z, f )); + errorGPU += std::abs( native->get( x, y, z, f ) - gpu->get( x, y, z, f )); + } + ) + } + mpi::reduceInplace(errorCPU, mpi::SUM); + mpi::reduceInplace(errorGPU, mpi::SUM); + auto domainBB = blocks->getDomainCellBB(); + errorCPU /= real_c(domainBB.numCells()); + errorGPU /= real_c(domainBB.numCells()); + WALBERLA_LOG_RESULT_ON_ROOT("CPU Error " << errorCPU ); + WALBERLA_LOG_RESULT_ON_ROOT("GPU Error " << errorGPU ); + WALBERLA_CHECK_FLOAT_EQUAL(errorCPU, real_c(0.0)); + WALBERLA_CHECK_FLOAT_EQUAL(errorGPU, real_c(0.0)); + } + + return 0; +} \ No newline at end of file diff --git a/tests/cuda/codegen/EquivalenceTest.gen.py b/tests/cuda/codegen/EquivalenceTest.gen.py new file mode 100644 index 0000000000000000000000000000000000000000..af4d3a8d8a2a1f685227497906fc349d0b50000f --- /dev/null +++ b/tests/cuda/codegen/EquivalenceTest.gen.py @@ -0,0 +1,40 @@ +import sympy as sp +from lbmpy_walberla import generate_lattice_model_files +from lbmpy.creationfunctions import create_lb_update_rule +from pystencils_walberla.sweep import Sweep + +# LB options +options = { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': sp.Symbol("omega"), + 'field_name': 'pdfs', + 'compressible': False, + 'maxwellian_moments': False, + 'temporary_field_name': 'pdfs_tmp', + 'optimization': {'cse_global': False, + 'cse_pdfs': False, + 'double_precision': True} +} + +# GPU optimization options +opt = {'gpu_indexing_params': 
{'block_size': (128, 2, 1)}, 'data_type': 'float64'} +outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': 'float64'} + + +def lb_assignments(): + ur = create_lb_update_rule(**options) + return ur.all_assignments + + +generate_lattice_model_files(class_name='EquivalenceTest_LatticeModel', **options) + +Sweep.generate_inner_outer_kernel('EquivalenceTest_GPUKernel', + lambda: create_lb_update_rule(**options).all_assignments, + target='gpu', + temporary_fields=['pdfs_tmp'], + field_swaps=[('pdfs', 'pdfs_tmp')], + optimization=opt, + outer_optimization=outer_opt) + +Sweep.generate_pack_info('EquivalenceTest_GPUPackInfo', lb_assignments, target='gpu')