Commit bdfa6981 authored by Martin Bauer's avatar Martin Bauer
Browse files

GPU Fields: alignment function such that first inner cell is aligned

previously the ghost layer was aligned
parent 020743a1
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AlignedAllocation.h
//! \ingroup cuda
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "cuda/ErrorChecking.h"
#include "core/debug/CheckFunctions.h"
#include "core/debug/Debug.h"
#include "core/logging/Logging.h"
namespace walberla {
namespace cuda {
static std::map<void *, void*> freePointers_;
inline void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
{
// With 0 alignment this function makes no sense
// use normal malloc instead
WALBERLA_ASSERT_GREATER( alignment, 0 );
// Tests if alignment is power of two (assuming alignment>0)
WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
WALBERLA_ASSERT_LESS( offset, alignment );
if( offset == 0 )
{
void * result = nullptr;
WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) );
freePointers_[result] = result;
return result;
}
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment ));
WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment");
ptr = (void *) ((char *) (pa) + alignment - offset);
freePointers_[ptr] = pa;
WALBERLA_ASSERT_EQUAL(((size_t) ptr + offset) % alignment, 0 );
return ptr;
}
inline void free_aligned_with_offset( void *ptr )
{
// assume that pointer to real allocated chunk is stored just before
// chunk that was given to user
WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] ));
freePointers_.erase(ptr);
}
inline void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height,
size_t alignment, size_t alignmentOffset )
{
if( width % alignment == 0)
pitchOut = width;
else
pitchOut = ((width + alignment) / alignment ) * alignment;
WALBERLA_ASSERT_GREATER_EQUAL( pitchOut, width );
WALBERLA_ASSERT_EQUAL( pitchOut % alignment, 0 );
return allocate_aligned_with_offset( pitchOut * height, alignment, alignmentOffset );
}
} // namespace cuda
} // namespace walberla
......@@ -21,7 +21,7 @@
#include "GPUField.h"
#include "ErrorChecking.h"
#include "AlignedAllocation.h"
#include "core/logging/Logging.h"
namespace walberla {
......@@ -51,7 +51,12 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
if ( usePitchedMem_ )
{
WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) );
size_t pitch;
const size_t alignment = 256;
void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment,
sizeof(T) * nrOfGhostLayers_ );
WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 );
pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height );
}
else
{
......@@ -79,9 +84,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
template<typename T>
GPUField<T>::~GPUField()
{
cudaFree( pitchedPtr_.ptr );
if( usePitchedMem_ )
free_aligned_with_offset(pitchedPtr_.ptr );
else
{
WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) );
}
}
template<typename T>
T * GPUField<T>::dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f)
{
......
......@@ -41,7 +41,7 @@ namespace field {
void *pa; // pointer to allocated memory
void *ptr; // pointer to usable aligned memory
pa=std::malloc((size+alignment-1)+sizeof(void *));
pa = std::malloc((size + alignment - 1) + sizeof( void * ));
if(!pa)
return nullptr;
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file AlignmentTest.h
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "cuda/AlignedAllocation.h"
#include "core/mpi/Environment.h"
#include "core/debug/TestSubsystem.h"
#include "core/logging/Logging.h"
using namespace walberla;
using namespace cuda;
int main( int argc, char ** argv )
{
debug::enterTestMode();
mpi::Environment env( argc, argv );
size_t pitch = 0;
size_t width = 7;
size_t height = 20;
size_t alignment = 512;
size_t offset = 16;
void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset );
WALBERLA_LOG_INFO("Pitch " << pitch);
char * cptr = reinterpret_cast<char*>( ptr );
WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 );
free_aligned_with_offset( ptr );
return 0;
}
......@@ -32,6 +32,10 @@ waLBerla_compile_test( FILES communication/CommTest )
waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
#waLBerla_execute_test( NAME CudaMPI )
waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop )
waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
......@@ -26,23 +26,27 @@
#include "cuda/GPUField.h"
#include "cuda/FieldCopy.h"
#include "core/math/Random.h"
using namespace walberla;
void simpleTransfer()
{
Field<double,4> h_f1 ( 16, 20, 30, 42.0, field::fzyx );
Field<double,4> h_f2 ( 16, 20, 30, 0.0, field::fzyx );
Field<double, 4> h_f1( 16, 20, 30, 42.0, field::fzyx );
Field<double, 4> h_f2( 16, 20, 30, 0.0, field::fzyx );
WALBERLA_FOR_ALL_CELLS_XYZ(&h_f1,
h_f1(x, y, z, 0) = math::realRandom<double>();
)
cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx );
cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx );
WALBERLA_CHECK_EQUAL( h_f1.xSize() ,d_f.xSize() );
WALBERLA_CHECK_EQUAL( h_f1.ySize() ,d_f.ySize() );
WALBERLA_CHECK_EQUAL( h_f1.zSize() ,d_f.zSize() );
WALBERLA_CHECK_EQUAL( h_f1.fSize() ,d_f.fSize() );
WALBERLA_CHECK_EQUAL( h_f1.layout(),d_f.layout() );
WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize());
WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize());
WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize());
WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize());
WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout());
cuda::fieldCpy( d_f, h_f1 );
......@@ -52,9 +56,7 @@ void simpleTransfer()
}
int main( int argc, char ** argv )
int main( int argc, char **argv )
{
debug::enterTestMode();
walberla::Environment walberlaEnv( argc, argv );
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file MicroBenchmarkPdfCopy.h
//! \author Martin Bauer <martin.bauer@fau.de>
//
//======================================================================================================================
#include "core/debug/TestSubsystem.h"
#include "core/mpi/Environment.h"
#include "blockforest/Initialization.h"
#include "field/Field.h"
#include "cuda/GPUField.h"
#include "cuda/FieldCopy.h"
#include "cuda/AddGPUFieldToStorage.h"
#include "MicroBenchmarkCopyKernel.h"
#include "MicroBenchmarkStreamKernel.h"
using namespace walberla;
int main( int argc, char **argv )
{
debug::enterTestMode();
mpi::Environment env( argc, argv );
shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u,
128u, 128u, 128u, 1.0, false, false, false, false);
BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "src", 19, field::fzyx, 1);
BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "dst", 19, field::fzyx, 1);
int iterations = 3;
pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID);
for( int i=0 ; i < iterations; ++i )
for( auto &block: *blocks )
copy( &block );
pystencils::MicroBenchmarkStreamKernel stream(dstID, srcID);
for( int i=0 ; i < iterations; ++i )
for( auto &block: *blocks )
stream( &block );
WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
return 0;
}
import pystencils as ps
from pystencils_walberla.sweep import Sweep
from lbmpy.updatekernels import create_stream_pull_only_kernel
from lbmpy.stencils import get_stencil
dtype = 'float64'
f_size = 19
def copy_only():
src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size),
layout='fzyx')
return [ps.Assignment(dst(i), src(i)) for i in range(f_size)]
def stream_only():
stencil = get_stencil("D3Q19")
return create_stream_pull_only_kernel(stencil, src_field_name='src',
dst_field_name='dst',
generic_field_type=dtype,
generic_layout='fzyx')
opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt)
Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment