From bdfa6981fb5a2945af819171762bd98c28e74ae0 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Wed, 24 Oct 2018 17:43:25 +0200 Subject: [PATCH] GPU Fields: alignment function such that first inner cell is aligned previously the ghost layer was aligned --- src/cuda/AlignedAllocation.h | 90 +++++++++++++++++++++ src/cuda/GPUField.impl.h | 17 +++- src/field/allocation/AlignedMalloc.cpp | 2 +- tests/cuda/AlignmentTest.cpp | 50 ++++++++++++ tests/cuda/CMakeLists.txt | 4 + tests/cuda/FieldTransferTest.cpp | 24 +++--- tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp | 66 +++++++++++++++ tests/cuda/codegen/MicroBenchmarkGpuLbm.py | 27 +++++++ 8 files changed, 265 insertions(+), 15 deletions(-) create mode 100644 src/cuda/AlignedAllocation.h create mode 100644 tests/cuda/AlignmentTest.cpp create mode 100644 tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp create mode 100644 tests/cuda/codegen/MicroBenchmarkGpuLbm.py diff --git a/src/cuda/AlignedAllocation.h b/src/cuda/AlignedAllocation.h new file mode 100644 index 000000000..20e44be92 --- /dev/null +++ b/src/cuda/AlignedAllocation.h @@ -0,0 +1,90 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AlignedAllocation.h +//! \ingroup cuda +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "cuda/ErrorChecking.h" +#include "core/debug/CheckFunctions.h" +#include "core/debug/Debug.h" +#include "core/logging/Logging.h" + +namespace walberla { +namespace cuda { + +static std::map<void *, void*> freePointers_; + + +inline void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset ) +{ + // With 0 alignment this function makes no sense + // use normal malloc instead + WALBERLA_ASSERT_GREATER( alignment, 0 ); + // Tests if alignment is power of two (assuming alignment>0) + WALBERLA_ASSERT( !(alignment & (alignment - 1)) ); + + WALBERLA_ASSERT_LESS( offset, alignment ); + + if( offset == 0 ) + { + void * result = nullptr; + WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) ); + freePointers_[result] = result; + return result; + } + + void *pa; // pointer to allocated memory + void *ptr; // pointer to usable aligned memory + + WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment )); + WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment"); + ptr = (void *) ((char *) (pa) + alignment - offset); + freePointers_[ptr] = pa; + + WALBERLA_ASSERT_EQUAL(((size_t) ptr + offset) % alignment, 0 ); + return ptr; +} + + +inline void free_aligned_with_offset( void *ptr ) +{ + // assume that pointer to real allocated chunk is stored just before + // chunk that was given to user + WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] )); + freePointers_.erase(ptr); +} + + + +inline void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height, + size_t alignment, size_t alignmentOffset ) +{ + if( width % alignment == 0) + pitchOut = width; + else + pitchOut = ((width + alignment) / alignment ) * alignment; + + WALBERLA_ASSERT_GREATER_EQUAL( pitchOut, width ); + WALBERLA_ASSERT_EQUAL( pitchOut % alignment, 0 ); + + return allocate_aligned_with_offset( pitchOut * height, alignment, alignmentOffset ); +} + +} // namespace cuda +} // namespace walberla + diff --git a/src/cuda/GPUField.impl.h b/src/cuda/GPUField.impl.h index e5b49c7d7..702af9b66 100644 --- a/src/cuda/GPUField.impl.h +++ b/src/cuda/GPUField.impl.h @@ -21,7 +21,7 @@ #include "GPUField.h" #include "ErrorChecking.h" - +#include "AlignedAllocation.h" #include "core/logging/Logging.h" namespace walberla { @@ -51,7 +51,12 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz if ( usePitchedMem_ ) { - WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) ); + size_t pitch; + const size_t alignment = 256; + void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment, + sizeof(T) * nrOfGhostLayers_ ); + WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 ); + pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height ); } else { @@ -79,9 +84,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz template<typename T> GPUField<T>::~GPUField() { - cudaFree( pitchedPtr_.ptr ); + if( usePitchedMem_ ) + free_aligned_with_offset(pitchedPtr_.ptr ); + else + { + WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) ); + } } + template<typename T> T * GPUField<T>::dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f) { diff --git a/src/field/allocation/AlignedMalloc.cpp b/src/field/allocation/AlignedMalloc.cpp index 578bcfd9f..d7e4409d5 100644 --- a/src/field/allocation/AlignedMalloc.cpp +++ b/src/field/allocation/AlignedMalloc.cpp @@ -41,7 +41,7 @@ namespace field { void *pa; // pointer to allocated memory void *ptr; // pointer to usable aligned memory - pa=std::malloc((size+alignment-1)+sizeof(void *)); + pa = std::malloc((size + alignment - 1) + sizeof( void * )); if(!pa) return nullptr; diff --git a/tests/cuda/AlignmentTest.cpp b/tests/cuda/AlignmentTest.cpp new file mode 100644 index 000000000..3de12c762 --- /dev/null +++ b/tests/cuda/AlignmentTest.cpp @@ -0,0 +1,50 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file AlignmentTest.h +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + +#include "cuda/AlignedAllocation.h" +#include "core/mpi/Environment.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Logging.h" + + +using namespace walberla; +using namespace cuda; + + +int main( int argc, char ** argv ) +{ + debug::enterTestMode(); + mpi::Environment env( argc, argv ); + + size_t pitch = 0; + size_t width = 7; + size_t height = 20; + size_t alignment = 512; + size_t offset = 16; + void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset ); + WALBERLA_LOG_INFO("Pitch " << pitch); + + char * cptr = reinterpret_cast<char*>( ptr ); + WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 ); + + free_aligned_with_offset( ptr ); + + return 0; +} diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt index 39222ebde..dac2f4a1c 100644 --- a/tests/cuda/CMakeLists.txt +++ b/tests/cuda/CMakeLists.txt @@ -32,6 +32,10 @@ waLBerla_compile_test( FILES communication/CommTest ) waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui ) #waLBerla_execute_test( NAME CudaMPI ) +waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop ) + +waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py) + waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui ) diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/cuda/FieldTransferTest.cpp index 4cdd11b9f..7a41330a2 100644 --- a/tests/cuda/FieldTransferTest.cpp +++ b/tests/cuda/FieldTransferTest.cpp @@ -26,23 +26,27 @@ #include "cuda/GPUField.h" #include "cuda/FieldCopy.h" +#include "core/math/Random.h" using namespace walberla; void simpleTransfer() { - Field<double,4> h_f1 ( 16, 20, 30, 42.0, field::fzyx ); - Field<double,4> h_f2 ( 16, 20, 30, 0.0, field::fzyx ); + Field<double, 4> h_f1( 16, 20, 30, 42.0, field::fzyx ); + Field<double, 4> h_f2( 16, 20, 30, 0.0, field::fzyx ); + WALBERLA_FOR_ALL_CELLS_XYZ(&h_f1, + h_f1(x, y, z, 0) = math::realRandom<double>(); + ) - cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx ); + cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx ); - WALBERLA_CHECK_EQUAL( h_f1.xSize() ,d_f.xSize() ); - WALBERLA_CHECK_EQUAL( h_f1.ySize() ,d_f.ySize() ); - WALBERLA_CHECK_EQUAL( h_f1.zSize() ,d_f.zSize() ); - WALBERLA_CHECK_EQUAL( h_f1.fSize() ,d_f.fSize() ); - WALBERLA_CHECK_EQUAL( h_f1.layout(),d_f.layout() ); + WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize()); + WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize()); + WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize()); + WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize()); + WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout()); cuda::fieldCpy( d_f, h_f1 ); @@ -52,9 +56,7 @@ void simpleTransfer() } - - -int main( int argc, char ** argv ) +int main( int argc, char **argv ) { debug::enterTestMode(); walberla::Environment walberlaEnv( argc, argv ); diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp new file mode 100644 index 000000000..8516c0933 --- /dev/null +++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp @@ -0,0 +1,66 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file MicroBenchmarkPdfCopy.h +//! \author Martin Bauer <martin.bauer@fau.de> +// +//====================================================================================================================== + + +#include "core/debug/TestSubsystem.h" +#include "core/mpi/Environment.h" +#include "blockforest/Initialization.h" + +#include "field/Field.h" + +#include "cuda/GPUField.h" +#include "cuda/FieldCopy.h" +#include "cuda/AddGPUFieldToStorage.h" + +#include "MicroBenchmarkCopyKernel.h" +#include "MicroBenchmarkStreamKernel.h" + + +using namespace walberla; + + +int main( int argc, char **argv ) +{ + debug::enterTestMode(); + mpi::Environment env( argc, argv ); + + shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u, + 128u, 128u, 128u, 1.0, false, false, false, false); + + BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "src", 19, field::fzyx, 1); + BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "dst", 19, field::fzyx, 1); + + int iterations = 3; + + pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID); + for( int i=0 ; i < iterations; ++i ) + for( auto &block: *blocks ) + copy( &block ); + + + pystencils::MicroBenchmarkStreamKernel stream(dstID, srcID); + for( int i=0 ; i < iterations; ++i ) + for( auto &block: *blocks ) + stream( &block ); + + WALBERLA_CUDA_CHECK(cudaDeviceSynchronize()); + + return 0; +} diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py new file mode 100644 index 000000000..b722f0510 --- /dev/null +++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py @@ -0,0 +1,27 @@ +import pystencils as ps +from pystencils_walberla.sweep import Sweep +from lbmpy.updatekernels import create_stream_pull_only_kernel +from lbmpy.stencils import get_stencil + +dtype = 'float64' +f_size = 19 + + +def copy_only(): + src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size), + layout='fzyx') + return [ps.Assignment(dst(i), src(i)) for i in range(f_size)] + + +def stream_only(): + stencil = get_stencil("D3Q19") + return create_stream_pull_only_kernel(stencil, src_field_name='src', + dst_field_name='dst', + generic_field_type=dtype, + generic_layout='fzyx') + + +opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} + +Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt) +Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt) -- GitLab