From 427a8c684e656e14e965f7bcdf080a9cdaf8e0f2 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Wed, 31 Oct 2018 13:46:29 +0100 Subject: [PATCH] Refactored test for GPU communication - comparison to old CPU transfer communication included --- tests/cuda/codegen/EquivalenceTest.cpp | 22 ++++++++++++++++++---- tests/cuda/codegen/EquivalenceTest.gen.py | 12 +++++++----- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/cuda/codegen/EquivalenceTest.cpp b/tests/cuda/codegen/EquivalenceTest.cpp index bbdc079d7..75bbd273e 100644 --- a/tests/cuda/codegen/EquivalenceTest.cpp +++ b/tests/cuda/codegen/EquivalenceTest.cpp @@ -6,7 +6,6 @@ #include "field/FlagField.h" #include "field/AddToStorage.h" #include "lbm/communication/PdfFieldPackInfo.h" -#include "lbm/vtk/VTKOutput.h" #include "lbm/PerformanceLogger.h" #include "blockforest/communication/UniformBufferedScheme.h" #include "timeloop/all.h" @@ -58,7 +57,8 @@ void initPdfField( const shared_ptr<StructuredBlockForest> &blocks, BlockDataID auto globalZ = real_c( offset[2] + z ); auto xArg = real_c(std::sin(real_c(globalX) / real_t(4) * real_c(domainBB.size(0)) )); auto zArg = real_c(std::sin(real_c(globalZ) / real_t(4) * real_c(domainBB.size(2)) )); - pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( 0.05 * std::sin(xArg), 0, 0.05 * std::cos(zArg))); + pdfField->setToEquilibrium( x, y, z, Vector3<real_t>( real_t(0.05) * std::sin(xArg), 0, + real_t(0.05) * std::cos(zArg))); ); } } @@ -82,6 +82,7 @@ int main( int argc, char **argv ) BlockDataID flagFieldId = field::addFlagFieldToStorage<FlagField_T>( blocks, "flag field" ); const FlagUID fluidFlagUID( "Fluid" ); geometry::setNonBoundaryCellsToDomain<FlagField_T>( *blocks, flagFieldId, fluidFlagUID ); + GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega ); // Part 1 : Native walberla @@ -99,7 +100,6 @@ int main( int argc, char **argv ) // Part 2: Generated CPU Version - GeneratedLatticeModel_T generatedLatticeModel = GeneratedLatticeModel_T( omega ); BlockDataID pdfFieldGeneratedId = lbm::addPdfFieldToStorage( blocks, "pdfGenerated", generatedLatticeModel, field::fzyx ); initPdfField<GeneratedPdfField_T >( blocks, pdfFieldGeneratedId ); CpuCommScheme_T cpuComm( blocks ); @@ -113,6 +113,7 @@ int main( int argc, char **argv ) // Part 3: Generated GPU Version bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true ); bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); + bool oldCommunication = parameters.getParameter<bool>( "oldCommunication", false ); BlockDataID pdfShadowCPU = lbm::addPdfFieldToStorage( blocks, "cpu shadow field", generatedLatticeModel, field::fzyx ); initPdfField<GeneratedPdfField_T >( blocks, pdfShadowCPU ); @@ -126,10 +127,23 @@ int main( int argc, char **argv ) gpuComm.addPackInfo( make_shared<pystencils::EquivalenceTest_GPUPackInfo>( pdfGpuFieldId )); auto runCommunication = [&]() { gpuComm(); }; + CpuCommScheme_T oldGpuScheme( blocks ); + + std::vector<cudaStream_t > streams; + for(uint_t i=0; i < Stencil_T::Size; ++i ) { + cudaStream_t s; + cudaStreamCreate(&s); + streams.push_back(s); + } + using OldPackInfo = cuda::communication::GPUPackInfo<cuda::GPUField<real_t> >; + oldGpuScheme.addPackInfo( make_shared<OldPackInfo>(pdfGpuFieldId, streams) ); + + SweepTimeloop gpuTimeLoop( blocks->getBlockStorage(), timesteps ); if( !overlapCommunication ) { - gpuTimeLoop.add() << BeforeFunction( runCommunication, "gpu communication" ) + gpuTimeLoop.add() << (oldCommunication ? BeforeFunction(oldGpuScheme) : + BeforeFunction( runCommunication, "gpu communication" )) << Sweep( cudaLbKernel, "LB stream & collide gpu" ); } else diff --git a/tests/cuda/codegen/EquivalenceTest.gen.py b/tests/cuda/codegen/EquivalenceTest.gen.py index af4d3a8d8..43140ca53 100644 --- a/tests/cuda/codegen/EquivalenceTest.gen.py +++ b/tests/cuda/codegen/EquivalenceTest.gen.py @@ -3,6 +3,8 @@ from lbmpy_walberla import generate_lattice_model_files from lbmpy.creationfunctions import create_lb_update_rule from pystencils_walberla.sweep import Sweep +dtype = 'float64' + # LB options options = { 'method': 'srt', @@ -12,14 +14,14 @@ options = { 'compressible': False, 'maxwellian_moments': False, 'temporary_field_name': 'pdfs_tmp', - 'optimization': {'cse_global': False, - 'cse_pdfs': False, - 'double_precision': True} + 'optimization': {'cse_global': True, + 'cse_pdfs': True, + 'double_precision': dtype == 'float64'} } # GPU optimization options -opt = {'gpu_indexing_params': {'block_size': (128, 2, 1)}, 'data_type': 'float64'} -outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': 'float64'} +opt = {'gpu_indexing_params': {'block_size': (128, 1, 1)}, 'data_type': dtype} +outer_opt = {'gpu_indexing_params': {'block_size': (32, 32, 32)}, 'data_type': dtype} def lb_assignments(): -- GitLab