//====================================================================================================================== // // This file is part of waLBerla. waLBerla is free software: you can // redistribute it and/or modify it under the terms of the GNU General Public // License as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. // // waLBerla is distributed in the hope that it will be useful, but WITHOUT // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License // for more details. // // You should have received a copy of the GNU General Public License along // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file GPUCopy.cpp //! \ingroup cuda //! \author Paulo Carvalho <prcjunior@inf.ufpr.br> //! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br> //! \brief Copy routines of 4D intervals involving GPU buffers. // //====================================================================================================================== #include "core/debug/Debug.h" #include "GPUCopy.h" #include "ErrorChecking.h" #include <cstring> namespace walberla { namespace cuda { void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ); cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); p.kind = cudaMemcpyDeviceToDevice; if ( copyStream == 0 ) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) { copyFunctor( dstF, srcF, Nf ); } else { for( uint_t f = 0; f < Nf; ++f ) { copyFunctor( dstF + f, srcF + f, uint_c(1) ); } } } void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); p.kind = cudaMemcpyDeviceToDevice; if ( copyStream == 0 ) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) { copyFunctor( dstZ, srcZ, Nz ); } else { for( uint_t z = 0; z < Nz; ++z ) { copyFunctor( dstZ + z, srcZ + z, 1 ); } } } void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny ); p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); p.kind = cudaMemcpyHostToDevice; if (copyStream == 0) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { // Using cudaMemcpy3DAsync requires page-locked memory on the host! WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if ( Nf == 1 || ( Nz == dstAllocSizeZ ) ) { copyFunctor( dstF, srcF, Nf ); } else { for( uint_t f = 0; f < Nf; ++f ) { copyFunctor( dstF + f, srcF + f, uint_c(1) ); } } } void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx ); p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize ); p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); p.kind = cudaMemcpyHostToDevice; if ( copyStream == 0 ) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { // Using cudaMemcpy3DAsync requires page-locked memory on the host! WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) { copyFunctor( dstZ, srcZ, Nz ); } else { for( uint_t z = 0; z < Nz; ++z ) { copyFunctor( dstZ + z, srcZ + z, 1 ); } } } void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) { cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ ); p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ ); p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny ); p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize ); p.kind = cudaMemcpyDeviceToHost; if ( copyStream == 0 ) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { // Using cudaMemcpy3DAsync requires page-locked memory on the host! WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) ) { copyFunctor( dstF, srcF, Nf ); } else { for( uint_t f = 0; f < Nf; ++f ) { copyFunctor( dstF + f, srcF + f, 1 ); } } } void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src, std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset, std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset, uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize, std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize, cudaStream_t copyStream ) { const uint_t & Nx = std::get<0>(intervalSize), & Ny = std::get<1>(intervalSize), & Nz = std::get<2>(intervalSize), & Nf = std::get<3>(intervalSize); const uint_t & srcX = std::get<0>(srcOffset), & srcY = std::get<1>(srcOffset), & srcZ = std::get<2>(srcOffset), & srcF = std::get<3>(srcOffset); const uint_t & dstX = std::get<0>(dstOffset), & dstY = std::get<1>(dstOffset), & dstZ = std::get<2>(dstOffset), & dstF = std::get<3>(dstOffset); auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) { cudaMemcpy3DParms p; std::memset( &p, 0, sizeof(p) ); p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY ); p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize ); p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY ); p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx ); p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize ); p.kind = cudaMemcpyDeviceToHost; if ( copyStream == 0 ) { WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) ); } else { // Using cudaMemcpy3DAsync requires page-locked memory on the host! WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) ); } }; if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) ) { copyFunctor( dstZ, srcZ, Nz ); } else { for( uint_t z = 0; z < Nz; ++z ) { copyFunctor( dstZ + z, srcZ + z, 1 ); } } } } // namespace cuda } // namespace walberla