GPU Fields: alignment function such that first inner cell is aligned

previously the ghost layer was aligned

GPU Fields: alignment function such that first inner cell is aligned
previously the ghost layer was aligned
bdfa6981 · Martin Bauer · 020743a1 · bdfa6981 · bdfa6981 · bdfa6981
Commit bdfa6981 authored 6 years ago by Martin Bauer
--- a/src/cuda/AlignedAllocation.h
+++ b/src/cuda/AlignedAllocation.h
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignedAllocation.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+#include "cuda/ErrorChecking.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/debug/Debug.h"
+#include "core/logging/Logging.h"
+namespace walberla {
+namespace cuda {
+static std::map<void *, void*> freePointers_;
+inline void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
+{
+   // With 0 alignment this function makes no sense
+   // use normal malloc instead
+   WALBERLA_ASSERT_GREATER( alignment, 0 );
+   // Tests if alignment is power of two (assuming alignment>0)
+   WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+   WALBERLA_ASSERT_LESS( offset, alignment );
+   if( offset == 0 )
+   {
+      void * result = nullptr;
+      WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) );
+      freePointers_[result] = result;
+      return result;
+   }
+   void *pa;  // pointer to allocated memory
+   void *ptr; // pointer to usable aligned memory
+   WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment ));
+   WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment");
+   ptr = (void *) ((char *) (pa) + alignment - offset);
+   freePointers_[ptr] = pa;
+   WALBERLA_ASSERT_EQUAL(((size_t) ptr + offset) % alignment, 0 );
+   return ptr;
+}
+inline void free_aligned_with_offset( void *ptr )
+{
+   // assume that pointer to real allocated chunk is stored just before
+   // chunk that was given to user
+   WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] ));
+   freePointers_.erase(ptr);
+}
+inline void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height,
+                                           size_t alignment, size_t alignmentOffset )
+{
+   if( width % alignment == 0)
+      pitchOut = width;
+   else
+      pitchOut = ((width + alignment) / alignment ) * alignment;
+   WALBERLA_ASSERT_GREATER_EQUAL( pitchOut, width );
+   WALBERLA_ASSERT_EQUAL( pitchOut  % alignment, 0 );
+   return allocate_aligned_with_offset( pitchOut * height, alignment, alignmentOffset );
+}
+} // namespace cuda
+} // namespace walberla
--- a/src/cuda/GPUField.impl.h
+++ b/src/cuda/GPUField.impl.h
@@ -21,7 +21,7 @@
 #include "GPUField.h"
 #include "ErrorChecking.h"
+#include "AlignedAllocation.h"
 #include "core/logging/Logging.h"
 namespace walberla {
@@ -51,7 +51,12 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
   if ( usePitchedMem_ )
   {
-      WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) );
+      size_t pitch;
+      const size_t alignment = 256;
+      void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment,
+                                                 sizeof(T) * nrOfGhostLayers_ );
+      WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 );
+      pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height );
   }
   else
   {
@@ -79,9 +84,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
 template<typename T>
 GPUField<T>::~GPUField()
 {
-   cudaFree( pitchedPtr_.ptr );
+   if( usePitchedMem_ )
+      free_aligned_with_offset(pitchedPtr_.ptr );
+   else
+   {
+      WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) );
+   }
 }
 template<typename T>
 T * GPUField<T>::dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f)
 {

--- a/src/field/allocation/AlignedMalloc.cpp
+++ b/src/field/allocation/AlignedMalloc.cpp
@@ -41,7 +41,7 @@ namespace field {
      void *pa;  // pointer to allocated memory
      void *ptr; // pointer to usable aligned memory
-      pa=std::malloc((size+alignment-1)+sizeof(void *));
+      pa = std::malloc((size + alignment - 1) + sizeof( void * ));
      if(!pa)
         return nullptr;

--- a/tests/cuda/AlignmentTest.cpp
+++ b/tests/cuda/AlignmentTest.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignmentTest.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+#include "cuda/AlignedAllocation.h"
+#include "core/mpi/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+using namespace walberla;
+using namespace cuda;
+int main( int argc, char ** argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+   size_t pitch = 0;
+   size_t width = 7;
+   size_t height = 20;
+   size_t alignment = 512;
+   size_t offset = 16;
+   void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset );
+   WALBERLA_LOG_INFO("Pitch " << pitch);
+   char * cptr = reinterpret_cast<char*>( ptr );
+   WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 );
+   free_aligned_with_offset( ptr );
+   return 0;
+}
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -32,6 +32,10 @@ waLBerla_compile_test( FILES communication/CommTest )
 waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
 #waLBerla_execute_test( NAME  CudaMPI )
+waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop )
+waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
 waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
                          FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
                          DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
--- a/tests/cuda/FieldTransferTest.cpp
+++ b/tests/cuda/FieldTransferTest.cpp
@@ -26,23 +26,27 @@
 #include "cuda/GPUField.h"
 #include "cuda/FieldCopy.h"
+#include "core/math/Random.h"
 using namespace walberla;
 void simpleTransfer()
 {
-   Field<double,4>  h_f1 ( 16, 20, 30, 42.0, field::fzyx );
+   Field<double, 4> h_f1( 16, 20, 30, 42.0, field::fzyx );
-   Field<double,4>  h_f2 ( 16, 20, 30,  0.0, field::fzyx );
+   Field<double, 4> h_f2( 16, 20, 30, 0.0, field::fzyx );
+   WALBERLA_FOR_ALL_CELLS_XYZ(&h_f1,
+      h_f1(x, y, z, 0) = math::realRandom<double>();
+   )
-   cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx );
+   cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx );
-   WALBERLA_CHECK_EQUAL(  h_f1.xSize() ,d_f.xSize() );
+   WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize());
-   WALBERLA_CHECK_EQUAL(  h_f1.ySize() ,d_f.ySize() );
+   WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize());
-   WALBERLA_CHECK_EQUAL(  h_f1.zSize() ,d_f.zSize() );
+   WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize());
-   WALBERLA_CHECK_EQUAL(  h_f1.fSize() ,d_f.fSize() );
+   WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize());
-   WALBERLA_CHECK_EQUAL(  h_f1.layout(),d_f.layout()     );
+   WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout());
   cuda::fieldCpy( d_f, h_f1 );
@@ -52,9 +56,7 @@ void simpleTransfer()
 }
+int main( int argc, char **argv )
-int main( int argc, char ** argv )
 {
   debug::enterTestMode();
   walberla::Environment walberlaEnv( argc, argv );

--- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MicroBenchmarkPdfCopy.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+#include "core/debug/TestSubsystem.h"
+#include "core/mpi/Environment.h"
+#include "blockforest/Initialization.h"
+#include "field/Field.h"
+#include "cuda/GPUField.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/AddGPUFieldToStorage.h"
+#include "MicroBenchmarkCopyKernel.h"
+#include "MicroBenchmarkStreamKernel.h"
+using namespace walberla;
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+   shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u,
+           128u, 128u, 128u, 1.0, false, false, false, false);
+   BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "src", 19, field::fzyx, 1);
+   BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "dst", 19, field::fzyx, 1);
+   int iterations = 3;
+   pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         copy( &block );
+   pystencils::MicroBenchmarkStreamKernel stream(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         stream( &block );
+   WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
+   return 0;
+}
--- a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
+import pystencils as ps
+from pystencils_walberla.sweep import Sweep
+from lbmpy.updatekernels import create_stream_pull_only_kernel
+from lbmpy.stencils import get_stencil
+dtype = 'float64'
+f_size = 19
+def copy_only():
+    src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size),
+                         layout='fzyx')
+    return [ps.Assignment(dst(i), src(i)) for i in range(f_size)]
+def stream_only():
+    stencil = get_stencil("D3Q19")
+    return create_stream_pull_only_kernel(stencil, src_field_name='src',
+                                          dst_field_name='dst',
+                                          generic_field_type=dtype,
+                                          generic_layout='fzyx')
+opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
+Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt)
+Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt)