From 427a8c684e656e14e965f7bcdf080a9cdaf8e0f2 Mon Sep 17 00:00:00 2001
From: Martin Bauer <martin.bauer@fau.de>
Date: Wed, 31 Oct 2018 13:46:29 +0100
Subject: [PATCH] Refactored test for GPU communication

- Added an option to compare against the old communication scheme that transfers data via the CPU
---
 tests/cuda/codegen/EquivalenceTest.cpp    | 22 ++++++++++++++++++----
 tests/cuda/codegen/EquivalenceTest.gen.py | 12 +++++++-----
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/tests/cuda/codegen/EquivalenceTest.cpp b/tests/cuda/codegen/EquivalenceTest.cpp
index bbdc079d7..75bbd273e 100644
--- a/tests/cuda/codegen/EquivalenceTest.cpp
+++ b/tests/cuda/codegen/EquivalenceTest.cpp
@@ -6,7 +6,6 @@
 #include "field/FlagField.h"
 #include "field/AddToStorage.h"
 #include "lbm/communication/PdfFieldPackInfo.h"
-#include "lbm/vtk/VTKOutput.h"
 #include "lbm/PerformanceLogger.h"
 #include "blockforest/communication/UniformBufferedScheme.h"
 #include "timeloop/all.h"
@@ -58,7 +57,8 @@ void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID
          auto globalZ = real_c( offset[2] + z );
          auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) ));
          auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) ));
-         pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( 0.05 * std::sin(xArg), 0, 0.05 * std::cos(zArg)));
+         pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( real_t(0.05) * std::sin(xArg), 0,
+                                                               real_t(0.05) * std::cos(zArg)));
       );
    }
 }
@@ -82,6 +82,7 @@ int main( int argc, char **argv )
       BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" );
       const FlagUID fluidFlagUID( "Fluid" );
       geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID );
+      GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega );
 
 
       // Part 1 : Native walberla
@@ -99,7 +100,6 @@ int main( int argc, char **argv )
 
 
       // Part 2: Generated CPU Version
-      GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega );
       BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx );
       initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId );
       CpuCommScheme_T cpuComm( blocks );
@@ -113,6 +113,7 @@ int main( int argc, char **argv )
       // Part 3: Generated GPU Version
       bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
+      bool oldCommunication = parameters.getParameter<bool>( "oldCommunication", false );
 
       BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx );
       initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU );
@@ -126,10 +127,23 @@ int main( int argc, char **argv )
       gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId ));
       auto runCommunication = [&]() { gpuComm(); };
 
+      CpuCommScheme_T oldGpuScheme( blocks );
+
+      std::vector<cudaStream_t > streams;
+      for(uint_t i=0; i < Stencil_T::Size; ++i ) {
+         cudaStream_t s;
+         cudaStreamCreate(&s);
+         streams.push_back(s);
+      }
+      using OldPackInfo = cuda::communication::GPUPackInfo<cuda::GPUField<real_t> >;
+      oldGpuScheme.addPackInfo( make_shared<OldPackInfo>(pdfGpuFieldId, streams) );
+
+
       SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps );
       if( !overlapCommunication )
       {
-         gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" )
+         gpuTimeLoop.add() << (oldCommunication ? BeforeFunction(oldGpuScheme) :
+                                                  BeforeFunction( runCommunication, "gpu communication" ))
                            << Sweep( cudaLbKernel, "LB stream & collide gpu" );
       }
       else
diff --git a/tests/cuda/codegen/EquivalenceTest.gen.py b/tests/cuda/codegen/EquivalenceTest.gen.py
index af4d3a8d8..43140ca53 100644
--- a/tests/cuda/codegen/EquivalenceTest.gen.py
+++ b/tests/cuda/codegen/EquivalenceTest.gen.py
@@ -3,6 +3,8 @@ from lbmpy_walberla import generate_lattice_model_files
 from lbmpy.creationfunctions import create_lb_update_rule
 from pystencils_walberla.sweep import Sweep
 
+dtype = 'float64'
+
 # LB options
 options = {
     'method': 'srt',
@@ -12,14 +14,14 @@ options = {
     'compressible': False,
     'maxwellian_moments': False,
     'temporary_field_name': 'pdfs_tmp',
-    'optimization': {'cse_global': False,
-                     'cse_pdfs': False,
-                     'double_precision': True}
+    'optimization': {'cse_global': True,
+                     'cse_pdfs': True,
+                     'double_precision': dtype == 'float64'}
 }
 
 # GPU optimization options
-opt =       {'gpu_indexing_params': {'block_size': (128, 2, 1)},  'data_type': 'float64'}
-outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': 'float64'}
+opt =       {'gpu_indexing_params': {'block_size': (128, 1, 1)},  'data_type': dtype}
+outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype}
 
 
 def lb_assignments():
-- 
GitLab