diff --git a/src/cuda/GPUCopy.cpp b/src/cuda/GPUCopy.cpp index e1d6c86e723befc0622631cda612359dbd429040..3c1c8f12649ab4f2de483e4b6a3ebf6877790910 100644 --- a/src/cuda/GPUCopy.cpp +++ b/src/cuda/GPUCopy.cpp @@ -1,21 +1,22 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. -// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUCopy.cpp //! \ingroup cuda //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Copy routines of 4D intervals involving GPU buffers. // //====================================================================================================================== @@ -29,116 +30,365 @@ namespace walberla { namespace cuda { +void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); + + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); -void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) + p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz 
* fIntervalSize ); + p.kind = cudaMemcpyDeviceToDevice; + + if ( copyStream == 0 ) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } +} + + +void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ) { - WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ); - - cudaMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - p.srcPos.x = srcX * typeSz; - p.srcPos.y = srcY; - p.srcPos.z = srcF * srcAllocZ + srcZ; - p.srcPtr.ptr = src.ptr; - p.srcPtr.pitch = src.pitch; - p.srcPtr.xsize = src.xsize; - p.srcPtr.ysize = src.ysize; - - p.dstPos.x = dstX * typeSz; - p.dstPos.y = dstY; - p.dstPos.z = dstF * dstAllocZ + dstZ; - p.dstPtr.ptr = dst.ptr; - p.dstPtr.pitch = dst.pitch; - p.dstPtr.xsize = dst.xsize; - p.dstPtr.ysize = dst.ysize; - - p.extent.width = Nx * typeSz; - p.extent.height = Ny; - p.extent.depth = Nz * Nf; - - p.kind = cudaMemcpyDeviceToDevice; - - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = cudaMemcpyDeviceToDevice; + + if ( copyStream == 0 ) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } } -void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) +void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t 
copyStream ) { - WALBERLA_ASSERT( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ); - - cudaMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - p.srcPos.x = srcX * typeSz; - p.srcPos.y = srcY; - p.srcPos.z = srcF * srcAllocZ + srcZ; - p.srcPtr.ptr = src; - p.srcPtr.pitch = Nx * typeSz; - p.srcPtr.xsize = Nx * typeSz; - p.srcPtr.ysize = Ny; - - p.dstPos.x = dstX * typeSz; - p.dstPos.y = dstY; - p.dstPos.z = dstF * dstAllocZ + dstZ; - p.dstPtr.ptr = dst.ptr; - p.dstPtr.pitch = dst.pitch; - p.dstPtr.xsize = dst.xsize; - p.dstPtr.ysize = dst.ysize; - - p.extent.width = Nx * typeSz; - p.extent.height = Ny; - p.extent.depth = Nz * Nf; - - p.kind = cudaMemcpyHostToDevice; - - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); + + p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = cudaMemcpyHostToDevice; + + if (copyStream == 0) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
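+         // (This is why GPUPackInfo stages pageable communication buffers through a
+         //  PinnedMemoryBuffer before issuing the asynchronous transfer on copyStream.)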
+ WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, uint_c(1) ); + } + } } +void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); -void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); + + p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); + + p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + p.kind = cudaMemcpyHostToDevice; + + if ( copyStream == 0 ) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
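+         // (Same caveat as above: src must point to page-locked host memory, otherwise
+         //  the copy cannot overlap with other work on copyStream.)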
+ WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} + + +void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ) { - cudaMemcpy3DParms p; - memset( &p, 0, sizeof(p) ); - - p.srcPos.x = srcX * typeSz; - p.srcPos.y = srcY; - p.srcPos.z = srcF * srcAllocZ + srcZ; - p.srcPtr.ptr = src.ptr; - p.srcPtr.pitch = src.pitch; - p.srcPtr.xsize = src.xsize; - p.srcPtr.ysize = src.ysize; - - p.dstPos.x = dstX * typeSz; - p.dstPos.y = dstY; - p.dstPos.z = dstF * dstAllocZ + dstZ; - p.dstPtr.ptr = dst; - p.dstPtr.pitch = Nx * typeSz; - p.dstPtr.xsize = Nx * typeSz; - p.dstPtr.ysize = Ny; - - p.extent.width = Nx * typeSz; - p.extent.height = Ny; - p.extent.depth = Nz * Nf; - - p.kind = cudaMemcpyDeviceToHost; - - WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); + p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); + p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); + + p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); + p.kind = cudaMemcpyDeviceToHost; + + if ( copyStream == 0 ) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + // Using cudaMemcpy3DAsync requires page-locked memory on the host! 
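+         // (For this device-to-host direction it is the destination pointer dst that
+         //  must be page-locked for the copy to be truly asynchronous.)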
+ WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) + { + copyFunctor( dstF, srcF, Nf ); + } + else + { + for( uint_t f = 0; f < Nf; ++f ) + { + copyFunctor( dstF + f, srcF + f, 1 ); + } + } } +void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ) +{ + const uint_t & Nx = std::get<0>(intervalSize), + & Ny = std::get<1>(intervalSize), + & Nz = std::get<2>(intervalSize), + & Nf = std::get<3>(intervalSize); + + const uint_t & srcX = std::get<0>(srcOffset), + & srcY = std::get<1>(srcOffset), + & srcZ = std::get<2>(srcOffset), + & srcF = std::get<3>(srcOffset); + + const uint_t & dstX = std::get<0>(dstOffset), + & dstY = std::get<1>(dstOffset), + & dstZ = std::get<2>(dstOffset), + & dstF = std::get<3>(dstOffset); + + auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { + cudaMemcpy3DParms p; + memset( &p, 0, sizeof(p) ); + + p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); + p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); + + p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); + p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); + + p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); + + p.kind = cudaMemcpyDeviceToHost; + + if ( copyStream == 0 ) + { + WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); + } + else + { + // Using cudaMemcpy3DAsync requires page-locked memory on the host! + WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); + } + }; + + + if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) + { + copyFunctor( dstZ, srcZ, Nz ); + } + else + { + for( uint_t z = 0; z < Nz; ++z ) + { + copyFunctor( dstZ + z, srcZ + z, 1 ); + } + } +} } // namespace cuda } // namespace walberla diff --git a/src/cuda/GPUCopy.h b/src/cuda/GPUCopy.h index 7c6c69a7963cd8ecc3bdd1289065c3e070890a32..a854068cdc67fe5c4f9d202ecd7cc9559e9661f7 100644 --- a/src/cuda/GPUCopy.h +++ b/src/cuda/GPUCopy.h @@ -1,21 +1,22 @@ //====================================================================================================================== // -// This file is part of waLBerla. waLBerla is free software: you can +// This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public -// License as published by the Free Software Foundation, either version 3 of +// License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. -// -// waLBerla is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. -// +// // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). 
If not, see <http://www.gnu.org/licenses/>. // //! \file GPUCopy.h //! \ingroup cuda //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Copy routines of 4D intervals involving GPU buffers. // //====================================================================================================================== @@ -24,6 +25,7 @@ #include "core/DataTypes.h" +#include <tuple> #include <cuda_runtime.h> @@ -31,152 +33,87 @@ namespace walberla { namespace cuda { -//******************************************************************************************************************* -/*! Restricted version of copyDevToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). - * See copyDevToDevFZYX() for more details. - *******************************************************************************************************************/ -void copyDevToDevFZYXRestricted( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); - - -//******************************************************************************************************************* +//**************************************************************************************************************************** /*! Copy a 4D interval of a device buffer to another device buffer with fzyx memory layout. * - * \param dst destination buffer - * \param src source buffer - * \param typeSz size of an f element - * \param dstAllocZ allocation size in z direction of the destination buffer - * \param srcAllocZ allocation size in z direction of the source buffer - * \param dstX x coordinate of the interval start point in the destination buffer - * \param dstY y coordinate of the interval start point in the destination buffer - * \param dstZ z coordinate of the interval start point in the destination buffer - * \param dstF f coordinate of the interval start point in the destination buffer - * \param srcX x coordinate of the interval start point in the source buffer - * \param srcY y coordinate of the interval start point in the source buffer - * \param srcZ z coordinate of the interval start point in the source buffer - * \param srcF f coordinate of the interval start point in the source buffer - * \param Nx interval size in x direction - * \param Ny interval size in y direction - * \param Nz interval size in z direction - * \param Nf interval size in f direction - *******************************************************************************************************************/ -inline void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) -{ - if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) - { - copyDevToDevFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF, - srcX, srcY, srcZ, srcF, - Nx, Ny, Nz, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyDevToDevFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF + f, - srcX, srcY, srcZ, srcF + f, - Nx, Ny, Nz, 1 ); - } - } -} - - 
-//******************************************************************************************************************* -/*! Restricted version of copyHostToDevFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). - * See copyHostToDevFZYX() for more details. - *******************************************************************************************************************/ -void copyHostToDevFZYXRestricted( const cudaPitchedPtr& dst, unsigned char* src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); - + * \param dst destination buffer + * \param src source buffer + * \param dstOffset (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer + * \param srcOffset (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer + * \param dstAllocSizeY allocation size in y direction of the destination buffer + * \param srcAllocSizeY allocation size in y direction of the source buffer + * \param typeSize size of an f element + * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + *****************************************************************************************************************************/ +void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); + +//**************************************************************************************************************************** +/*! Copy a 4D interval of a device buffer to another device buffer with zyxf memory layout. + * + * \param dst destination buffer + * \param src source buffer + * \param dstOffset (x, y, z, f)-tuple containing the coordinate of the interval start point in the destination buffer + * \param srcOffset (x, y, z, f)-tuple containing the coordinate of the interval start point in the source buffer + * \param dstAllocSizeY allocation size in y direction of the destination buffer + * \param srcAllocSizeY allocation size in y direction of the source buffer + * \param typeSize size of an f element + * \param copyStream CUDA stream, if not NULL copy operations will be performed asynchronously + *****************************************************************************************************************************/ +void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a host buffer to a device buffer with fzyx memory layout. See copyDevToDevFZYX() for * parameter information. 
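+ * \note If copyStream is non-zero, src must reside in page-locked (pinned) host memory
+ *       for the copy to actually be performed asynchronously.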
*******************************************************************************************************************/ -inline void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) -{ - if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) - { - copyHostToDevFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF, - srcX, srcY, srcZ, srcF, - Nx, Ny, Nz, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyHostToDevFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF + f, - srcX, srcY, srcZ, srcF + f, - Nx, Ny, Nz, 1 ); - } - } -} - +void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); //******************************************************************************************************************* -/*! Restricted version of copyDevToHostFZYX() that requires Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ). - * See copyDevToHostFZYX() for more details. +/*! Copy a 4D interval of a host buffer to a device buffer with zyxf memory layout. See copyDevToDevZYXF() for + * parameter information. *******************************************************************************************************************/ -void copyDevToHostFZYXRestricted( unsigned char* dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ); - +void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); //******************************************************************************************************************* /*! Copy a 4D interval of a device buffer to a host buffer with fzyx memory layout. See copyDevToDevFZYX() for * parameter information. 
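+ * \note If copyStream is non-zero, dst must reside in page-locked (pinned) host memory
+ *       for the copy to actually be performed asynchronously.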
*******************************************************************************************************************/ -inline void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, - uint_t typeSz, uint_t dstAllocZ, uint_t srcAllocZ, - uint_t dstX, uint_t dstY, uint_t dstZ, uint_t dstF, - uint_t srcX, uint_t srcY, uint_t srcZ, uint_t srcF, - uint_t Nx, uint_t Ny, uint_t Nz, uint_t Nf ) -{ - if( Nf == 1 || ( Nz == dstAllocZ && Nz == srcAllocZ ) ) - { - copyDevToHostFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF, - srcX, srcY, srcZ, srcF, - Nx, Ny, Nz, Nf ); - } - else - { - for( uint_t f = 0; f < Nf; ++f ) - { - copyDevToHostFZYXRestricted( dst, src, - typeSz, dstAllocZ, srcAllocZ, - dstX, dstY, dstZ, dstF + f, - srcX, srcY, srcZ, srcF + f, - Nx, Ny, Nz, 1 ); - } - } -} - +void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); +//******************************************************************************************************************* +/*! Copy a 4D interval of a device buffer to a host buffer with zyxf memory layout. See copyDevToDevZYXF() for + * parameter information. + *******************************************************************************************************************/ +void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, + std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, + std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, + uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, + std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, + cudaStream_t copyStream ); } // namespace cuda } // namespace walberla diff --git a/src/cuda/Kernel.h b/src/cuda/Kernel.h index 3b6acf0a1fe2bb7956110236e2b68d07dcb5e4b3..59e588399da12888fe102685797ff6c67e9dde61 100644 --- a/src/cuda/Kernel.h +++ b/src/cuda/Kernel.h @@ -102,8 +102,8 @@ namespace cuda { template<typename T> void addFieldIndexingParam( const T & indexing ); - void configure( dim3 gridDim, dim3 blockDim ); - void operator() () const; + void configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize = 0 ); + void operator() ( cudaStream_t stream = 0 ) const; protected: @@ -118,6 +118,7 @@ namespace cuda { bool configured_; dim3 gridDim_; dim3 blockDim_; + std::size_t sharedMemSize_; struct ParamInfo { std::vector<char> data; @@ -178,7 +179,8 @@ namespace cuda { template<typename FP> Kernel<FP>::Kernel( FP funcPtr ) : funcPtr_ ( funcPtr ), - configured_( false ) + configured_( false ), + sharedMemSize_( 0 ) {} template<typename FP> @@ -206,12 +208,13 @@ namespace cuda { } template<typename FP> - void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim ) + void Kernel<FP>::configure( dim3 gridDim, dim3 blockDim, std::size_t sharedMemSize ) { if ( ! 
configured_ ) { gridDim_ = gridDim; blockDim_ = blockDim; + sharedMemSize_ = sharedMemSize; configured_ = true; } else @@ -225,7 +228,7 @@ namespace cuda { } template<typename FP> - void Kernel<FP>::operator() () const + void Kernel<FP>::operator() ( cudaStream_t stream ) const { // check for correct number of parameter calls @@ -235,7 +238,7 @@ namespace cuda { } // set the number of blocks and threads, - WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_ ) ); //TODO extend class to support streams + WALBERLA_CUDA_CHECK( cudaConfigureCall( gridDim_, blockDim_, sharedMemSize_, stream ) ); // register all parameters for( auto paramIt = params_.begin(); paramIt != params_.end(); ++paramIt ) { diff --git a/src/cuda/communication/GPUPackInfo.h b/src/cuda/communication/GPUPackInfo.h index b3124dd392ebf5f27a7a72734afa349aa480ec45..61c698d82e729add8ffe639cfb31184d7d2bf5f4 100644 --- a/src/cuda/communication/GPUPackInfo.h +++ b/src/cuda/communication/GPUPackInfo.h @@ -16,16 +16,32 @@ //! \file GPUPackInfo.h //! \ingroup cuda //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //====================================================================================================================== #pragma once -#include "cuda/GPUCopy.h" -#include "communication/UniformPackInfo.h" #include "core/debug/Debug.h" -#include "field/GhostRegions.h" +#include "core/math/Vector3.h" +#include "core/mpi/BufferSizeTrait.h" + #include "stencil/Directions.h" +#include "field/GhostRegions.h" + +#include "communication/UniformPackInfo.h" + +#include "blockforest/Block.h" + +#include "cuda/ErrorChecking.h" +#include "cuda/GPUCopy.h" +#include "cuda/communication/PinnedMemoryBuffer.h" + +#include <cuda_runtime.h> +#include <map> +#include <vector> +#include <tuple> + namespace walberla { namespace cuda { @@ -42,17 +58,49 @@ template<typename GPUField_T> class GPUPackInfo : public walberla::communication::UniformPackInfo { public: - typedef typename GPUField_T::value_type T; + typedef typename GPUField_T::value_type FieldType; + + GPUPackInfo( const BlockDataID & bdId, cudaStream_t stream = 0 ) + : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ) + { + streams_.push_back( stream ); + copyAsync_ = (stream != 0); + } + + GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers, cudaStream_t stream = 0 ) + : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers ) + { + streams_.push_back( stream ); + copyAsync_ = (stream != 0); + } - GPUPackInfo( const BlockDataID & bdId ) : bdId_( bdId ), communicateAllGhostLayers_( true ), - numberOfGhostLayers_( 0 ) {} + GPUPackInfo( const BlockDataID & bdId, std::vector< cudaStream_t > & streams ) + : bdId_( bdId ), communicateAllGhostLayers_( true ), numberOfGhostLayers_( 0 ), streams_( streams ) + { + copyAsync_ = true; + for( auto streamIt = streams_.begin(); streamIt != streams_.end(); ++streamIt ) + if ( *streamIt == 0 ) + { + copyAsync_ = false; + break; + } + } - GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers ) : bdId_( bdId ), - communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers ) {} + GPUPackInfo( const BlockDataID & bdId, const uint_t numberOfGHostLayers, std::vector< cudaStream_t > & streams ) + : bdId_( bdId ), communicateAllGhostLayers_( false ), numberOfGhostLayers_( numberOfGHostLayers ), streams_( streams ) + { + copyAsync_ = true; + for( auto streamIt = 
streams_.begin(); streamIt != streams_.end(); ++streamIt ) + if ( *streamIt == 0 ) + { + copyAsync_ = false; + break; + } + } virtual ~GPUPackInfo() {} - bool constantDataExchange() const { return mpi::BufferSizeTrait<T>::constantSize; } + bool constantDataExchange() const { return mpi::BufferSizeTrait<FieldType>::constantSize; } bool threadsafeReceiving() const { return true; } void unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer); @@ -64,38 +112,73 @@ protected: uint_t numberOfGhostLayersToCommunicate( const GPUField_T * const field ) const; + inline cudaStream_t & getStream(stencil::Direction & dir) { return streams_[dir % streams_.size()]; } + const BlockDataID bdId_; bool communicateAllGhostLayers_; uint_t numberOfGhostLayers_; + std::vector< cudaStream_t > streams_; + bool copyAsync_; + std::map< stencil::Direction, PinnedMemoryBuffer > pinnedRecvBuffers_; + mutable std::map< stencil::Direction, PinnedMemoryBuffer > pinnedSendBuffers_; }; template<typename GPUField_T> void GPUPackInfo<GPUField_T>::unpackData(IBlock * receiver, stencil::Direction dir, mpi::RecvBuffer & buffer) { - GPUField_T * f = receiver->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(f); + GPUField_T * fieldPtr = receiver->getData< GPUField_T >( bdId_ ); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); - if ( f->layout() != fzyx ) + CellInterval fieldCi = field::getGhostRegion( *fieldPtr, dir, nrOfGhostLayers, false ); + + uint_t nrOfBytesToRead = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType); + + unsigned char * bufPtr = buffer.skip(nrOfBytesToRead); + + unsigned char * copyBufferPtr = bufPtr; + if ( copyAsync_ ) { - WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); + PinnedMemoryBuffer & pinnedBuffer = pinnedRecvBuffers_[dir]; + copyBufferPtr = pinnedBuffer.resize( nrOfBytesToRead ); + // Copy data into pinned memory buffer, in order to transfer it asynchronously to the GPU + std::copy( bufPtr, static_cast< unsigned char * >( bufPtr + nrOfBytesToRead ), copyBufferPtr ); } - cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) ); + cudaStream_t & unpackStream = getStream(dir); - CellInterval ci = field::getGhostRegion( *f, dir, nrOfGhostLayers, false ); + auto dstOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), + uint_c(fieldCi.yMin() + nrOfGhostLayers), + uint_c(fieldCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + auto srcOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); - uint_t nrOfBytesToRead = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T); + auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), + fieldPtr->fSize() ); - unsigned char * buf = buffer.skip(nrOfBytesToRead); + if ( fieldPtr->layout() == fzyx ) + { + const uint_t dstAllocSizeZ = fieldPtr->zAllocSize(); + const uint_t srcAllocSizeZ = fieldCi.zSize(); + copyHostToDevFZYX( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType), + intervalSize, unpackStream ); + } + else + { + const uint_t dstAllocSizeY = fieldPtr->yAllocSize(); + const uint_t srcAllocSizeY = fieldCi.ySize(); + copyHostToDevZYXF( fieldPtr->pitchedPtr(), copyBufferPtr, dstOffset, srcOffset, + dstAllocSizeY, srcAllocSizeY, sizeof(FieldType), + intervalSize, unpackStream ); + } - copyHostToDevFZYX( f->pitchedPtr(), buf, - sizeof(T), f->zAllocSize(), 
ci.zSize(), - uint_c(ci.xMin() + nrOfGhostLayers), - uint_c(ci.yMin() + nrOfGhostLayers), - uint_c(ci.zMin() + nrOfGhostLayers), 0, - 0, 0, 0, 0, - ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() ); + if ( copyAsync_ ) + { + WALBERLA_CUDA_CHECK( cudaStreamSynchronize( unpackStream ) ); + } } @@ -105,55 +188,116 @@ void GPUPackInfo<GPUField_T>::communicateLocal(const IBlock * sender, IBlock * r const GPUField_T * sf = sender ->getData< GPUField_T >( bdId_ ); GPUField_T * rf = receiver->getData< GPUField_T >( bdId_ ); - if ( sf->layout() != fzyx || rf->layout() != fzyx ) - { - WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); - } + WALBERLA_ASSERT_NOT_NULLPTR( sf ); + WALBERLA_ASSERT_NOT_NULLPTR( rf ); WALBERLA_ASSERT_EQUAL(sf->xSize(), rf->xSize()); WALBERLA_ASSERT_EQUAL(sf->ySize(), rf->ySize()); WALBERLA_ASSERT_EQUAL(sf->zSize(), rf->zSize()); WALBERLA_ASSERT_EQUAL(sf->fSize(), rf->fSize()); + WALBERLA_CHECK( sf->layout() == rf->layout(), "GPUPackInfo::communicateLocal: fields must have the same layout!" ); + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( sf ) ); CellInterval sCi = field::getSliceBeforeGhostLayer( *sf, dir, nrOfGhostLayers, false ); CellInterval rCi = field::getGhostRegion( *rf, stencil::inverseDir[dir], nrOfGhostLayers, false ); - copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(), - sizeof(T), rf->zAllocSize(), sf->zAllocSize(), - uint_c(rCi.xMin() + nrOfGhostLayers), uint_c(rCi.yMin() + nrOfGhostLayers), uint_c(rCi.zMin() + nrOfGhostLayers), 0, - uint_c(sCi.xMin() + nrOfGhostLayers), uint_c(sCi.yMin() + nrOfGhostLayers), uint_c(sCi.zMin() + nrOfGhostLayers), 0, - rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() ); + cudaStream_t & commStream = getStream(dir); + + auto dstOffset = std::make_tuple( uint_c(rCi.xMin() + nrOfGhostLayers), + uint_c(rCi.yMin() + nrOfGhostLayers), + uint_c(rCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + auto srcOffset = std::make_tuple( uint_c(sCi.xMin() + nrOfGhostLayers), + uint_c(sCi.yMin() + nrOfGhostLayers), + uint_c(sCi.zMin() + nrOfGhostLayers), + uint_c(0) ); + + auto intervalSize = std::make_tuple( rCi.xSize(), rCi.ySize(), rCi.zSize(), sf->fSize() ); + + if ( sf->layout() == fzyx ) + { + const uint_t dstAllocSizeZ = rf->zAllocSize(); + const uint_t srcAllocSizeZ = sf->zAllocSize(); + + copyDevToDevFZYX( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType), + intervalSize, commStream ); + } + else + { + const uint_t dstAllocSizeY = rf->yAllocSize(); + const uint_t srcAllocSizeY = sf->yAllocSize(); + + copyDevToDevZYXF( rf->pitchedPtr(), sf->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeY, srcAllocSizeY, sizeof(FieldType), + intervalSize, commStream ); + } + + if ( copyAsync_ ) + { + WALBERLA_CUDA_CHECK( cudaStreamSynchronize( commStream ) ); + } } template<typename GPUField_T> void GPUPackInfo<GPUField_T>::packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const { - const GPUField_T * f = sender->getData< GPUField_T >( bdId_ ); - WALBERLA_ASSERT_NOT_NULLPTR(f); + const GPUField_T * fieldPtr = sender->getData< GPUField_T >( bdId_ ); + WALBERLA_ASSERT_NOT_NULLPTR(fieldPtr); + + cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( fieldPtr ) ); + + CellInterval fieldCi = field::getSliceBeforeGhostLayer( *fieldPtr, dir, nrOfGhostLayers, false ); + + const std::size_t nrOfBytesToPack = fieldCi.numCells() * fieldPtr->fSize() * sizeof(FieldType); + + unsigned char * 
outBufferPtr = outBuffer.forward( nrOfBytesToPack ); + + const cudaStream_t & packStream = streams_[dir % streams_.size()]; - if ( f->layout() != fzyx ) + unsigned char * copyBufferPtr = outBufferPtr; + if ( copyAsync_ ) { - WALBERLA_ABORT( "GPUPackInfo currently only supports fzyx layout" ); + PinnedMemoryBuffer & pinnedBuffer = pinnedSendBuffers_[dir]; + copyBufferPtr = pinnedBuffer.resize( nrOfBytesToPack ); } - cell_idx_t nrOfGhostLayers = cell_idx_c( numberOfGhostLayersToCommunicate( f ) ); + auto dstOffset = std::make_tuple( uint_c(0), uint_c(0), uint_c(0), uint_c(0) ); + auto srcOffset = std::make_tuple( uint_c(fieldCi.xMin() + nrOfGhostLayers), + uint_c(fieldCi.yMin() + nrOfGhostLayers), + uint_c(fieldCi.zMin() + nrOfGhostLayers), + uint_c(0) ); - CellInterval ci = field::getSliceBeforeGhostLayer( *f, dir, nrOfGhostLayers, false ); + auto intervalSize = std::make_tuple( fieldCi.xSize(), fieldCi.ySize(), fieldCi.zSize(), + fieldPtr->fSize() ); - uint_t nrOfBytesToPack = ci.xSize() * ci.ySize() * ci.zSize() * f->fSize() * sizeof(T); + if ( fieldPtr->layout() == fzyx ) + { + const uint_t dstAllocSizeZ = fieldCi.zSize(); + const uint_t srcAllocSizeZ = fieldPtr->zAllocSize(); + copyDevToHostFZYX( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType), + intervalSize, packStream ); + } + else + { + const uint_t dstAllocSizeZ = fieldCi.ySize(); + const uint_t srcAllocSizeZ = fieldPtr->yAllocSize(); + copyDevToHostZYXF( copyBufferPtr, fieldPtr->pitchedPtr(), dstOffset, srcOffset, + dstAllocSizeZ, srcAllocSizeZ, sizeof(FieldType), + intervalSize, packStream ); + } - unsigned char * buf = outBuffer.forward( nrOfBytesToPack ); + if ( copyAsync_ ) + { + WALBERLA_CUDA_CHECK( cudaStreamSynchronize( packStream ) ); - copyDevToHostFZYX( buf, f->pitchedPtr(), - sizeof(T), ci.zSize(), f->zAllocSize(), - 0, 0, 0, 0, - uint_c(ci.xMin() + nrOfGhostLayers), - uint_c(ci.yMin() + nrOfGhostLayers), - uint_c(ci.zMin() + nrOfGhostLayers), 0, - ci.xSize(), ci.ySize(), ci.zSize(), f->fSize() ); + std::copy( copyBufferPtr, static_cast<unsigned char *>( copyBufferPtr + nrOfBytesToPack ), outBufferPtr ); + } } diff --git a/src/cuda/communication/PinnedMemoryBuffer.h b/src/cuda/communication/PinnedMemoryBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..702e71bd69aa9bfa7858e3595330b092edfe65dc --- /dev/null +++ b/src/cuda/communication/PinnedMemoryBuffer.h @@ -0,0 +1,123 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file PinnedMemoryBuffer.h +//! \ingroup cuda +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Pinned Memory buffer for staging memory when using asynchronous CUDA memory copies. 
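+//!
+//! A minimal usage sketch (the buffer grows on demand and frees its pinned storage on
+//! destruction; nrOfBytes is a placeholder for the required staging size):
+//!
+//! \code
+//!   PinnedMemoryBuffer stagingBuffer;
+//!   unsigned char * staging = stagingBuffer.resize( nrOfBytes ); // pinned via cudaMallocHost
+//!   // ... fill staging and hand it to cudaMemcpy3DAsync on a CUDA stream ...
+//! \endcode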
+// +//====================================================================================================================== + +#pragma once + +#include "cuda/ErrorChecking.h" + +#include <algorithm> +#include <cuda_runtime.h> + + +namespace walberla { +namespace cuda { +namespace communication { + +template< typename T = unsigned char > +class GenericPinnedMemoryBuffer +{ +public: + typedef T ElementType; + + GenericPinnedMemoryBuffer(); + + GenericPinnedMemoryBuffer( std::size_t initSize ); + + GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb ); + + ~GenericPinnedMemoryBuffer(); + + inline T* ptr() const { return data_; } + + inline T* resize( std::size_t newSize ); + + inline std::size_t size() const { return size_; } + + GenericPinnedMemoryBuffer & operator=( const GenericPinnedMemoryBuffer & pb ) = delete; +private: + T * data_; + std::size_t size_; +}; + +typedef GenericPinnedMemoryBuffer<> PinnedMemoryBuffer; + +template< typename T > // Element type +GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer() + : data_(nullptr), size_(0) +{ +} + + +template< typename T > // Element type +GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( std::size_t initSize ) + : data_(nullptr), size_(initSize) +{ + if (initSize > 0) + { + WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, size_ * sizeof(T) ) ); + } +} + +template< typename T > // Element type +GenericPinnedMemoryBuffer<T>::GenericPinnedMemoryBuffer( const GenericPinnedMemoryBuffer & pb ) + : size_(pb.size_) +{ + if ( pb.size_ > 0 ) + { + WALBERLA_CUDA_CHECK( cudaMallocHost( &data_, pb.size_ * sizeof(T) ) ); + + std::copy( pb.data_, static_cast<T *>(pb.data_ + pb.size_), data_ ); + } +} + +template< typename T > // Element type +GenericPinnedMemoryBuffer<T>::~GenericPinnedMemoryBuffer() +{ + if ( data_ != nullptr ) + { + WALBERLA_CUDA_CHECK( cudaFreeHost( data_ ) ); + } +} + +template< typename T > // Element type +T * GenericPinnedMemoryBuffer<T>::resize(std::size_t newSize) +{ + if ( newSize > size_ ) + { + T * newBegin; + WALBERLA_CUDA_CHECK( cudaMallocHost( &newBegin, newSize * sizeof(T) ) ); + + std::swap( data_, newBegin ); + if ( newBegin != nullptr ) + { + WALBERLA_CUDA_CHECK( cudaFreeHost( newBegin ) ); + } + + size_ = newSize; + } + + return data_; +} + +} // namespace communication +} // namespace cuda +} // namespace walberla diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 7cde4254bbfbe376d04d666d23fdfb81b73f11eb..aa82a445f73db26c8cb7312ab0445a8eefe6b5bc 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -7,6 +7,9 @@ waLBerla_compile_test( FILES communication/GPUPackInfoTest.cpp DEPENDS blockforest ) waLBerla_execute_test( NAME GPUPackInfoTest ) +waLBerla_compile_test( FILES communication/GPUPackInfoCommunicationTest.cpp DEPENDS domain_decomposition blockforest stencil ) +waLBerla_execute_test( NAME GPUPackInfoCommunicationTest ) + waLBerla_compile_test( FILES FieldTransferTest ) waLBerla_execute_test( NAME FieldTransferTest ) diff --git a/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f3f19d085012b02e8947d2d5100a7c2e3ad1dfb6 --- /dev/null +++ b/tests/cuda/communication/GPUPackInfoCommunicationTest.cpp @@ -0,0 +1,187 @@ +//======================================================================================================================== +// +// This file is part of waLBerla. 
waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file GPUFieldPackInfoTest.cpp +//! \ingroup cuda +//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> +//! \brief Short communication test to verify the equivalence of GPUPackInfo using a default stream and multiple streams. +// +//======================================================================================================================== + +#include "core/DataTypes.h" +#include "core/debug/TestSubsystem.h" +#include "core/math/Random.h" +#include "core/mpi/Environment.h" + +#include "stencil/Directions.h" +#include "stencil/Iterator.h" +#include "stencil/D3Q27.h" + +#include "domain_decomposition/BlockDataID.h" + +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "field/GhostLayerField.h" + +#include "cuda/ErrorChecking.h" +#include "cuda/HostFieldAllocator.h" +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" +#include "cuda/communication/GPUPackInfo.h" + +#include <cuda_runtime.h> +#include <vector> + +using namespace walberla; + +using DataType = walberla::uint_t; +using StencilType = stencil::D3Q27; +using FieldType = field::GhostLayerField< DataType, StencilType::Size >; +using GPUFieldType = cuda::GPUField< DataType >; +using CommSchemeType = blockforest::communication::UniformBufferedScheme<StencilType>; +using GPUPackInfoType = cuda::communication::GPUPackInfo< GPUFieldType >; + +static std::vector< cuda::Layout > fieldLayouts = { cuda::fzyx, cuda::zyxf }; +static uint_t fieldLayoutIndex = 0; + + +FieldType * createField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new FieldType( + storage->getNumberOfXCells( *block ), // number of cells in x direction per block + storage->getNumberOfYCells( *block ), // number of cells in y direction per block + storage->getNumberOfZCells( *block ), // number of cells in z direction per block + 1, // one ghost layer + DataType(0), // initial value + fieldLayouts[fieldLayoutIndex], // layout + make_shared<cuda::HostFieldAllocator< DataType > >() // allocator for host pinned memory + ); +} + + +GPUFieldType * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) +{ + return new GPUFieldType( + storage->getNumberOfXCells( *block ), // number of cells in x direction + storage->getNumberOfYCells( *block ), // number of cells in y direction + storage->getNumberOfZCells( *block ), // number of cells in z direction + StencilType::Size, // number of cells for pdfs + 1, // one ghost layer + fieldLayouts[fieldLayoutIndex] ); +} + + +void initFields( const shared_ptr< StructuredBlockStorage > & blocks, const BlockDataID & fieldID ) +{ + for( auto block = blocks->begin(); block != blocks->end(); ++block ) + { + auto fieldPtr = block->getData< FieldType >( fieldID ); + + for( auto fieldIt = fieldPtr->begin(); fieldIt != fieldPtr->end(); ++fieldIt ) + *fieldIt = 
math::intRandom< DataType >(); + } +} + + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + mpi::Environment mpiEnv( argc, argv ); + + std::vector< cudaStream_t > streams; + + for( uint_t i = 0; i < StencilType::Size; ++i ) + { + cudaStream_t stream(nullptr); + WALBERLA_CUDA_CHECK( cudaStreamCreate(&stream) ); + streams.push_back( stream ); + } + + const Vector3< uint_t > cells = Vector3< uint_t >( 4, 4, 4 ); + + uint_t nProc = uint_c( MPIManager::instance()->numProcesses() ); + + for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) + { + auto blocks = blockforest::createUniformBlockGrid(nProc, 1, 1, // blocks + cells[0], cells[1], cells[2], // cells + 1, // unit cell spacing + true, // one block per process + true, true, true); // periodic in all directions + + BlockDataID sourceFieldId = blocks->addStructuredBlockData< FieldType >( &createField, + "ScalarField" ); + + BlockDataID syncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, + "syncGPUField" ); + + BlockDataID asyncGPUFieldId = blocks->addStructuredBlockData< GPUFieldType >( &createGPUField, + "asyncGPUField" ); + + math::seedRandomGenerator( numeric_cast<boost::mt19937::result_type>( MPIManager::instance()->rank() ) ); + // Initialize CPU field with random values + initFields( blocks, sourceFieldId ); + + // Copy same CPU field to both GPU fields + for( auto block = blocks->begin(); block != blocks->end(); ++block ) + { + auto sourceFieldPtr = block->getData< FieldType >( sourceFieldId ); + + auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); + cuda::fieldCpy( *syncGPUFieldPtr, *sourceFieldPtr ); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); + cuda::fieldCpy( *asyncGPUFieldPtr, *sourceFieldPtr ); + } + + // Setup communication schemes for synchronous GPUPackInfo + CommSchemeType syncCommScheme(blocks); + syncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( syncGPUFieldId ) ); + + // Setup communication scheme for asynchronous GPUPackInfo, which uses CUDA streams + CommSchemeType asyncCommScheme(blocks); + asyncCommScheme.addPackInfo( boost::make_shared< GPUPackInfoType >( asyncGPUFieldId, streams ) ); + + // Perform one communication step for each scheme + syncCommScheme(); + asyncCommScheme(); + + // Check results + FieldType syncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< cuda::HostFieldAllocator< DataType > >() ); + FieldType asyncFieldCpu( cells[0], cells[1], cells[2], 1, fieldLayouts[fieldLayoutIndex], + make_shared< cuda::HostFieldAllocator< DataType > >() ); + + for( auto block = blocks->begin(); block != blocks->end(); ++block ) + { + auto syncGPUFieldPtr = block->getData< GPUFieldType >( syncGPUFieldId ); + cuda::fieldCpy( syncFieldCpu, *syncGPUFieldPtr ); + + auto asyncGPUFieldPtr = block->getData< GPUFieldType >( asyncGPUFieldId ); + cuda::fieldCpy( asyncFieldCpu, *asyncGPUFieldPtr ); + + for( auto syncIt = syncFieldCpu.beginWithGhostLayerXYZ(), asyncIt = asyncFieldCpu.beginWithGhostLayerXYZ(); + syncIt != syncFieldCpu.end(); + ++syncIt, ++asyncIt ) + WALBERLA_CHECK_EQUAL( *syncIt, *asyncIt ); + } + } + + for( uint_t i = 0; i < StencilType::Size; ++i ) + WALBERLA_CUDA_CHECK( cudaStreamDestroy(streams[i]) ); + + return EXIT_SUCCESS; +} diff --git a/tests/cuda/communication/GPUPackInfoTest.cpp b/tests/cuda/communication/GPUPackInfoTest.cpp index 28d3def70cd23dd3a69c6b3758a3e58f260684cf..29fc84f259cb66a1928337fe85c467a83ae2c6b6 100644 --- 
a/tests/cuda/communication/GPUPackInfoTest.cpp +++ b/tests/cuda/communication/GPUPackInfoTest.cpp @@ -34,12 +34,15 @@ #include "stencil/D3Q27.h" #include <cstring> +#include <vector> +#include <cuda_runtime.h> #define F_SIZE 19 - using namespace walberla; +static std::vector< field::Layout > fieldLayouts = { field::fzyx, field::zyxf }; +static uint_t fieldLayoutIndex = 0; cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorage* const storage ) { @@ -49,7 +52,7 @@ cuda::GPUField<int> * createGPUField( IBlock* const block, StructuredBlockStorag storage->getNumberOfZCells( *block ), // number of cells in z direction F_SIZE, // fSize 1, // number of ghost layers - field::fzyx ); + fieldLayouts[fieldLayoutIndex] ); } // Tester base class. The communicate() template method allows testing different communication methods. @@ -59,7 +62,9 @@ public: typedef cuda::communication::GPUPackInfo< cuda::GPUField<int> > GPUPackInfoType; - GPUPackInfoTester( IBlock* block, BlockDataID fieldId ): block_( block ), fieldId_( fieldId ) {} + GPUPackInfoTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ) : + block_( block ), fieldId_( fieldId ), streams_(streams) {} + virtual ~GPUPackInfoTester() {} void test( stencil::Direction dir ) @@ -72,7 +77,7 @@ public: gpuField.zSize(), // number of cells in z direction 1, // number of ghost layers 0, // initial value - field::fzyx); + fieldLayouts[fieldLayoutIndex]); cpuField.setWithGhostLayer( 0 ); int val = 0; @@ -82,7 +87,7 @@ public: } cuda::fieldCpy( gpuField, cpuField ); - GPUPackInfoType gpuPackInfo( fieldId_ ); + GPUPackInfoType gpuPackInfo( fieldId_, streams_ ); communicate( gpuPackInfo, dir ); cuda::fieldCpy( cpuField, gpuField ); @@ -101,6 +106,7 @@ protected: IBlock* block_; BlockDataID fieldId_; + std::vector< cudaStream_t > streams_; }; @@ -108,7 +114,7 @@ protected: class GPUPackInfoBufferTester: public GPUPackInfoTester { public: - GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} + GPUPackInfoBufferTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams): GPUPackInfoTester( block, fieldId, streams ) {} protected: void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) @@ -134,7 +140,7 @@ protected: class GPUPackInfoLocalTester: public GPUPackInfoTester { public: - GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId ): GPUPackInfoTester( block, fieldId ) {} + GPUPackInfoLocalTester( IBlock* block, BlockDataID fieldId, std::vector< cudaStream_t > & streams ): GPUPackInfoTester( block, fieldId, streams ) {} protected: void communicate( GPUPackInfoType& gpuPackInfo, stencil::Direction dir ) @@ -151,27 +157,42 @@ int main(int argc, char **argv) debug::enterTestMode(); MPIManager::instance()->initializeMPI(&argc,&argv); - // Create BlockForest - uint_t processes = uint_c( MPIManager::instance()->numProcesses() ); - auto blocks = createUniformBlockGrid(processes,1,1, //blocks - 2,2,2, //cells - 1, //dx - false, //one block per process - true,true,true);//periodicity - - - BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >( - &createGPUField, "ScalarGPUField" ); - - for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + for(; fieldLayoutIndex < fieldLayouts.size(); ++fieldLayoutIndex ) { - GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId ); - GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId ); + std::vector< 
cudaStream_t > streams; + for( uint_t s = 0; s < stencil::D3Q27::Size; ++s ) + { + cudaStream_t stream(nullptr); + WALBERLA_CUDA_CHECK( cudaStreamCreate( &stream ) ); + streams.push_back( stream ); + } + // Create BlockForest + uint_t processes = uint_c( MPIManager::instance()->numProcesses() ); + auto blocks = createUniformBlockGrid(processes,1,1, //blocks + 2,2,2, //cells + 1, //dx + false, //one block per process + true,true,true);//periodicity + + BlockDataID scalarGPUFieldId = blocks->addStructuredBlockData<cuda::GPUField<int> >( + &createGPUField, "ScalarGPUField" ); + + for( auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt ) + { + GPUPackInfoBufferTester bufferTester( &(*blockIt), scalarGPUFieldId, streams ); + GPUPackInfoLocalTester localTester( &(*blockIt), scalarGPUFieldId, streams ); + + for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir ) + { + localTester.test( *dir ); + bufferTester.test( *dir ); + } + } - for( auto dir = stencil::D3Q27::beginNoCenter(); dir != stencil::D3Q27::end(); ++dir ) + for( auto streamIt = streams.begin(); streamIt != streams.end(); ++streamIt ) { - localTester.test( *dir ); - bufferTester.test( *dir ); + cudaStream_t & stream = *streamIt; + WALBERLA_CUDA_CHECK( cudaStreamDestroy( stream ) ); } }