-
Michael Kuron authoredfb43f673
Forked from
waLBerla / waLBerla
465 commits behind the upstream repository.
GPUCopy.cpp 14.05 KiB
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GPUCopy.cpp
//! \ingroup cuda
//! \author Paulo Carvalho <prcjunior@inf.ufpr.br>
//! \author João Victor Tozatti Risso <jvtrisso@inf.ufpr.br>
//! \brief Copy routines of 4D intervals involving GPU buffers.
//
//======================================================================================================================
#include "core/debug/Debug.h"
#include "GPUCopy.h"
#include "ErrorChecking.h"
#include <cstring>
namespace walberla {
namespace cuda {
void copyDevToDevFZYX( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
WALBERLA_ASSERT( fIntervalSize == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) );
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
p.kind = cudaMemcpyDeviceToDevice;
if ( copyStream == 0 )
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
{
copyFunctor( dstF, srcF, Nf );
}
else
{
for( uint_t f = 0; f < Nf; ++f )
{
copyFunctor( dstF + f, srcF + f, uint_c(1) );
}
}
}
void copyDevToDevZYXF( const cudaPitchedPtr& dst, const cudaPitchedPtr& src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
p.kind = cudaMemcpyDeviceToDevice;
if ( copyStream == 0 )
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
{
copyFunctor( dstZ, srcZ, Nz );
}
else
{
for( uint_t z = 0; z < Nz; ++z )
{
copyFunctor( dstZ + z, srcZ + z, 1 );
}
}
}
void copyHostToDevFZYX( const cudaPitchedPtr& dst, unsigned char* src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
p.srcPtr = make_cudaPitchedPtr( src, Nx * typeSize, Nx * typeSize, Ny );
p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
p.kind = cudaMemcpyHostToDevice;
if (copyStream == 0)
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
// Using cudaMemcpy3DAsync requires page-locked memory on the host!
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if ( Nf == 1 || ( Nz == dstAllocSizeZ ) )
{
copyFunctor( dstF, srcF, Nf );
}
else
{
for( uint_t f = 0; f < Nf; ++f )
{
copyFunctor( dstF + f, srcF + f, uint_c(1) );
}
}
}
void copyHostToDevZYXF( const cudaPitchedPtr& dst, unsigned char* src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
p.srcPtr = make_cudaPitchedPtr( src, Nf * typeSize, Nf * typeSize, Nx );
p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
p.dstPtr = make_cudaPitchedPtr( dst.ptr, dst.pitch, dst.xsize, dst.ysize );
p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
p.kind = cudaMemcpyHostToDevice;
if ( copyStream == 0 )
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
// Using cudaMemcpy3DAsync requires page-locked memory on the host!
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
{
copyFunctor( dstZ, srcZ, Nz );
}
else
{
for( uint_t z = 0; z < Nz; ++z )
{
copyFunctor( dstZ + z, srcZ + z, 1 );
}
}
}
void copyDevToHostFZYX( unsigned char* dst, const cudaPitchedPtr& src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeZ, uint_t srcAllocSizeZ, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordF, uint_t srcCoordF, uint_t fIntervalSize) {
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcX * typeSize, srcY, srcCoordF * srcAllocSizeZ + srcZ );
p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
p.dstPos = make_cudaPos( dstX * typeSize, dstY, dstCoordF * dstAllocSizeZ + dstZ );
p.dstPtr = make_cudaPitchedPtr( dst, Nx * typeSize, Nx * typeSize, Ny );
p.extent = make_cudaExtent( Nx * typeSize, Ny, Nz * fIntervalSize );
p.kind = cudaMemcpyDeviceToHost;
if ( copyStream == 0 )
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
// Using cudaMemcpy3DAsync requires page-locked memory on the host!
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if( Nf == 1 || ( Nz == dstAllocSizeZ && Nz == srcAllocSizeZ ) )
{
copyFunctor( dstF, srcF, Nf );
}
else
{
for( uint_t f = 0; f < Nf; ++f )
{
copyFunctor( dstF + f, srcF + f, 1 );
}
}
}
void copyDevToHostZYXF( unsigned char* dst, const cudaPitchedPtr& src,
std::tuple< uint_t, uint_t, uint_t, uint_t > & dstOffset,
std::tuple< uint_t, uint_t, uint_t, uint_t > & srcOffset,
uint_t dstAllocSizeY, uint_t srcAllocSizeY, uint_t typeSize,
std::tuple< uint_t, uint_t, uint_t, uint_t > & intervalSize,
cudaStream_t copyStream )
{
const uint_t & Nx = std::get<0>(intervalSize),
& Ny = std::get<1>(intervalSize),
& Nz = std::get<2>(intervalSize),
& Nf = std::get<3>(intervalSize);
const uint_t & srcX = std::get<0>(srcOffset),
& srcY = std::get<1>(srcOffset),
& srcZ = std::get<2>(srcOffset),
& srcF = std::get<3>(srcOffset);
const uint_t & dstX = std::get<0>(dstOffset),
& dstY = std::get<1>(dstOffset),
& dstZ = std::get<2>(dstOffset),
& dstF = std::get<3>(dstOffset);
auto copyFunctor = [&](uint_t dstCoordZ, uint_t srcCoordZ, uint_t zIntervalSize) {
cudaMemcpy3DParms p;
std::memset( &p, 0, sizeof(p) );
p.srcPos = make_cudaPos( srcF * typeSize, srcX, srcCoordZ * srcAllocSizeY + srcY );
p.srcPtr = make_cudaPitchedPtr( src.ptr, src.pitch, src.xsize, src.ysize );
p.dstPos = make_cudaPos( dstF * typeSize, dstX, dstCoordZ * dstAllocSizeY + dstY );
p.dstPtr = make_cudaPitchedPtr( dst, Nf * typeSize, Nf * typeSize, Nx );
p.extent = make_cudaExtent( Nf * typeSize, Nx, Ny * zIntervalSize );
p.kind = cudaMemcpyDeviceToHost;
if ( copyStream == 0 )
{
WALBERLA_CUDA_CHECK( cudaMemcpy3D(&p) );
}
else
{
// Using cudaMemcpy3DAsync requires page-locked memory on the host!
WALBERLA_CUDA_CHECK( cudaMemcpy3DAsync(&p, copyStream) );
}
};
if ( Nz == 1 || ( Ny == dstAllocSizeY && Ny == srcAllocSizeY ) )
{
copyFunctor( dstZ, srcZ, Nz );
}
else
{
for( uint_t z = 0; z < Nz; ++z )
{
copyFunctor( dstZ + z, srcZ + z, 1 );
}
}
}
} // namespace cuda
} // namespace walberla