From bdfa6981fb5a2945af819171762bd98c28e74ae0 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 24 Oct 2018 17:43:25 +0200
Subject: [PATCH] GPU Fields: alignment function such that first inner cell is
 aligned

previously the ghost layer was aligned
---
 src/cuda/AlignedAllocation.h                | 90 +++++++++++++++++++++
 src/cuda/GPUField.impl.h                    | 17 +++-
 src/field/allocation/AlignedMalloc.cpp      |  2 +-
 tests/cuda/AlignmentTest.cpp                | 50 ++++++++++++
 tests/cuda/CMakeLists.txt                   |  4 +
 tests/cuda/FieldTransferTest.cpp            | 24 +++---
 tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp | 66 +++++++++++++++
 tests/cuda/codegen/MicroBenchmarkGpuLbm.py  | 27 +++++++
 8 files changed, 265 insertions(+), 15 deletions(-)
 create mode 100644 src/cuda/AlignedAllocation.h
 create mode 100644 tests/cuda/AlignmentTest.cpp
 create mode 100644 tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
 create mode 100644 tests/cuda/codegen/MicroBenchmarkGpuLbm.py

diff --git a/src/cuda/AlignedAllocation.h b/src/cuda/AlignedAllocation.h
new file mode 100644
index 000000000..20e44be92
--- /dev/null
+++ b/src/cuda/AlignedAllocation.h
@@ -0,0 +1,90 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignedAllocation.h
+//! \ingroup cuda
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "cuda/ErrorChecking.h"
+#include "core/debug/CheckFunctions.h"
+#include "core/debug/Debug.h"
+#include "core/logging/Logging.h"
+
+namespace walberla {
+namespace cuda {
+
+static std::map<void *, void*> freePointers_;
+
+
+inline void *allocate_aligned_with_offset( uint_t size, uint_t alignment, uint_t offset )
+{
+   // With 0 alignment this function makes no sense
+   // use normal malloc instead
+   WALBERLA_ASSERT_GREATER( alignment, 0 );
+   // Tests if alignment is power of two (assuming alignment>0)
+   WALBERLA_ASSERT( !(alignment & (alignment - 1)) );
+
+   WALBERLA_ASSERT_LESS( offset, alignment );
+
+   if( offset == 0 )
+   {
+      void * result = nullptr;
+      WALBERLA_CUDA_CHECK( cudaMalloc( &result, size ) );
+      freePointers_[result] = result;
+      return result;
+   }
+
+   void *pa;  // pointer to allocated memory
+   void *ptr; // pointer to usable aligned memory
+
+   WALBERLA_CUDA_CHECK( cudaMalloc( &pa, size + alignment ));
+   WALBERLA_CHECK_EQUAL(size_t(pa) % alignment, 0 , "CUDA malloc did not return memory with requested alignment");
+   ptr = (void *) ((char *) (pa) + alignment - offset);
+   freePointers_[ptr] = pa;
+
+   WALBERLA_ASSERT_EQUAL(((size_t) ptr + offset) % alignment, 0 );
+   return ptr;
+}
+
+
+inline void free_aligned_with_offset( void *ptr )
+{
+   // assume that pointer to real allocated chunk is stored just before
+   // chunk that was given to user
+   WALBERLA_CUDA_CHECK( cudaFree( freePointers_[ptr] ));
+   freePointers_.erase(ptr);
+}
+
+
+
+inline void *allocate_pitched_with_offset( size_t &pitchOut, size_t width, size_t height,
+                                           size_t alignment, size_t alignmentOffset )
+{
+   if( width % alignment == 0)
+      pitchOut = width;
+   else
+      pitchOut = ((width + alignment) / alignment ) * alignment;
+
+   WALBERLA_ASSERT_GREATER_EQUAL( pitchOut, width );
+   WALBERLA_ASSERT_EQUAL( pitchOut  % alignment, 0 );
+
+   return allocate_aligned_with_offset( pitchOut * height, alignment, alignmentOffset );
+}
+
+} // namespace cuda
+} // namespace walberla
+
diff --git a/src/cuda/GPUField.impl.h b/src/cuda/GPUField.impl.h
index e5b49c7d7..702af9b66 100644
--- a/src/cuda/GPUField.impl.h
+++ b/src/cuda/GPUField.impl.h
@@ -21,7 +21,7 @@
 
 #include "GPUField.h"
 #include "ErrorChecking.h"
-
+#include "AlignedAllocation.h"
 #include "core/logging/Logging.h"
 
 namespace walberla {
@@ -51,7 +51,12 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
 
    if ( usePitchedMem_ )
    {
-      WALBERLA_CUDA_CHECK ( cudaMalloc3D ( &pitchedPtr_, extent ) );
+      size_t pitch;
+      const size_t alignment = 256;
+      void * mem = allocate_pitched_with_offset( pitch, extent.width, extent.height * extent.depth, alignment,
+                                                 sizeof(T) * nrOfGhostLayers_ );
+      WALBERLA_ASSERT_EQUAL( size_t((char*)(mem) + sizeof(T) * nrOfGhostLayers_ ) % alignment, 0 );
+      pitchedPtr_ = make_cudaPitchedPtr( mem, pitch, extent.width, extent.height );
    }
    else
    {
@@ -79,9 +84,15 @@ GPUField<T>::GPUField( uint_t _xSize, uint_t _ySize, uint_t _zSize, uint_t _fSiz
 template<typename T>
 GPUField<T>::~GPUField()
 {
-   cudaFree( pitchedPtr_.ptr );
+   if( usePitchedMem_ )
+      free_aligned_with_offset(pitchedPtr_.ptr );
+   else
+   {
+      WALBERLA_CUDA_CHECK( cudaFree( pitchedPtr_.ptr ) );
+   }
 }
 
+
 template<typename T>
 T * GPUField<T>::dataAt(cell_idx_t x, cell_idx_t y, cell_idx_t z, cell_idx_t f)
 {
diff --git a/src/field/allocation/AlignedMalloc.cpp b/src/field/allocation/AlignedMalloc.cpp
index 578bcfd9f..d7e4409d5 100644
--- a/src/field/allocation/AlignedMalloc.cpp
+++ b/src/field/allocation/AlignedMalloc.cpp
@@ -41,7 +41,7 @@ namespace field {
       void *pa;  // pointer to allocated memory
       void *ptr; // pointer to usable aligned memory
 
-      pa=std::malloc((size+alignment-1)+sizeof(void *));
+      pa = std::malloc((size + alignment - 1) + sizeof( void * ));
       if(!pa)
          return nullptr;
 
diff --git a/tests/cuda/AlignmentTest.cpp b/tests/cuda/AlignmentTest.cpp
new file mode 100644
index 000000000..3de12c762
--- /dev/null
+++ b/tests/cuda/AlignmentTest.cpp
@@ -0,0 +1,50 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file AlignmentTest.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+#include "cuda/AlignedAllocation.h"
+#include "core/mpi/Environment.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+
+
+using namespace walberla;
+using namespace cuda;
+
+
+int main( int argc, char ** argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+
+   size_t pitch = 0;
+   size_t width = 7;
+   size_t height = 20;
+   size_t alignment = 512;
+   size_t offset = 16;
+   void *ptr = allocate_pitched_with_offset( pitch, width, height, alignment, offset );
+   WALBERLA_LOG_INFO("Pitch " << pitch);
+
+   char * cptr = reinterpret_cast<char*>( ptr );
+   WALBERLA_CHECK_EQUAL( size_t(cptr + offset) % alignment, 0 );
+
+   free_aligned_with_offset( ptr );
+
+   return 0;
+}
diff --git a/tests/cuda/CMakeLists.txt b/tests/cuda/CMakeLists.txt
index 39222ebde..dac2f4a1c 100644
--- a/tests/cuda/CMakeLists.txt
+++ b/tests/cuda/CMakeLists.txt
@@ -32,6 +32,10 @@ waLBerla_compile_test( FILES communication/CommTest )
 waLBerla_compile_test( FILES CudaMPI DEPENDS blockforest timeloop gui )
 #waLBerla_execute_test( NAME  CudaMPI )
 
+waLBerla_compile_test( FILES AlignmentTest.cpp DEPENDS blockforest timeloop )
+
+waLBerla_compile_test( FILES codegen/MicroBenchmarkGpuLbm.cpp codegen/MicroBenchmarkGpuLbm.py)
+
 waLBerla_add_executable ( NAME CpuGpuGeneratedEquivalenceTest
                           FILES codegen/EquivalenceTest.cpp codegen/EquivalenceTest.gen.py
                           DEPENDS blockforest boundary core cuda field stencil timeloop vtk gui )
diff --git a/tests/cuda/FieldTransferTest.cpp b/tests/cuda/FieldTransferTest.cpp
index 4cdd11b9f..7a41330a2 100644
--- a/tests/cuda/FieldTransferTest.cpp
+++ b/tests/cuda/FieldTransferTest.cpp
@@ -26,23 +26,27 @@
 
 #include "cuda/GPUField.h"
 #include "cuda/FieldCopy.h"
+#include "core/math/Random.h"
 
 
 using namespace walberla;
 
 void simpleTransfer()
 {
-   Field<double,4>  h_f1 ( 16, 20, 30, 42.0, field::fzyx );
-   Field<double,4>  h_f2 ( 16, 20, 30,  0.0, field::fzyx );
+   Field<double, 4> h_f1( 16, 20, 30, 42.0, field::fzyx );
+   Field<double, 4> h_f2( 16, 20, 30, 0.0, field::fzyx );
 
+   WALBERLA_FOR_ALL_CELLS_XYZ(&h_f1,
+      h_f1(x, y, z, 0) = math::realRandom<double>();
+   )
 
-   cuda::GPUField<double> d_f ( 16,20,30,4,0, field::fzyx );
+   cuda::GPUField<double> d_f( 16, 20, 30, 4, 0, field::fzyx );
 
-   WALBERLA_CHECK_EQUAL(  h_f1.xSize() ,d_f.xSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.ySize() ,d_f.ySize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.zSize() ,d_f.zSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.fSize() ,d_f.fSize() );
-   WALBERLA_CHECK_EQUAL(  h_f1.layout(),d_f.layout()     );
+   WALBERLA_CHECK_EQUAL( h_f1.xSize(), d_f.xSize());
+   WALBERLA_CHECK_EQUAL( h_f1.ySize(), d_f.ySize());
+   WALBERLA_CHECK_EQUAL( h_f1.zSize(), d_f.zSize());
+   WALBERLA_CHECK_EQUAL( h_f1.fSize(), d_f.fSize());
+   WALBERLA_CHECK_EQUAL( h_f1.layout(), d_f.layout());
 
 
    cuda::fieldCpy( d_f, h_f1 );
@@ -52,9 +56,7 @@ void simpleTransfer()
 }
 
 
-
-
-int main( int argc, char ** argv )
+int main( int argc, char **argv )
 {
    debug::enterTestMode();
    walberla::Environment walberlaEnv( argc, argv );
diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
new file mode 100644
index 000000000..8516c0933
--- /dev/null
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.cpp
@@ -0,0 +1,66 @@
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file MicroBenchmarkPdfCopy.h
+//! \author Martin Bauer <martin.bauer@fau.de>
+//
+//======================================================================================================================
+
+
+#include "core/debug/TestSubsystem.h"
+#include "core/mpi/Environment.h"
+#include "blockforest/Initialization.h"
+
+#include "field/Field.h"
+
+#include "cuda/GPUField.h"
+#include "cuda/FieldCopy.h"
+#include "cuda/AddGPUFieldToStorage.h"
+
+#include "MicroBenchmarkCopyKernel.h"
+#include "MicroBenchmarkStreamKernel.h"
+
+
+using namespace walberla;
+
+
+int main( int argc, char **argv )
+{
+   debug::enterTestMode();
+   mpi::Environment env( argc, argv );
+
+   shared_ptr<StructuredBlockForest> blocks = blockforest::createUniformBlockGrid(1u, 1u, 1u,
+           128u, 128u, 128u, 1.0, false, false, false, false);
+
+   BlockDataID srcID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "src", 19, field::fzyx, 1);
+   BlockDataID dstID = cuda::addGPUFieldToStorage<cuda::GPUField<double> >(blocks, "dst", 19, field::fzyx, 1);
+
+   int iterations = 3;
+
+   pystencils::MicroBenchmarkCopyKernel copy(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         copy( &block );
+
+
+   pystencils::MicroBenchmarkStreamKernel stream(dstID, srcID);
+   for( int i=0 ; i < iterations; ++i )
+      for( auto &block: *blocks )
+         stream( &block );
+
+   WALBERLA_CUDA_CHECK(cudaDeviceSynchronize());
+
+   return 0;
+}
diff --git a/tests/cuda/codegen/MicroBenchmarkGpuLbm.py b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
new file mode 100644
index 000000000..b722f0510
--- /dev/null
+++ b/tests/cuda/codegen/MicroBenchmarkGpuLbm.py
@@ -0,0 +1,27 @@
+import pystencils as ps
+from pystencils_walberla.sweep import Sweep
+from lbmpy.updatekernels import create_stream_pull_only_kernel
+from lbmpy.stencils import get_stencil
+
+dtype = 'float64'
+f_size = 19
+
+
+def copy_only():
+    src, dst = ps.fields("src({f_size}), dst({f_size}) : {dtype}[3D]".format(dtype=dtype, f_size=f_size),
+                         layout='fzyx')
+    return [ps.Assignment(dst(i), src(i)) for i in range(f_size)]
+
+
+def stream_only():
+    stencil = get_stencil("D3Q19")
+    return create_stream_pull_only_kernel(stencil, src_field_name='src',
+                                          dst_field_name='dst',
+                                          generic_field_type=dtype,
+                                          generic_layout='fzyx')
+
+
+opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype}
+
+Sweep.generate_from_equations('MicroBenchmarkCopyKernel', copy_only, target='gpu', optimization=opt)
+Sweep.generate_from_equations('MicroBenchmarkStreamKernel', stream_only, target='gpu', optimization=opt)
-- 
GitLab