From 6dc551089231a281a27a7360d881d2c66152ad17 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Wed, 24 Apr 2019 12:59:26 +0200 Subject: [PATCH] UniformGridGPU - fix in time measurement - options to disable boundaries and/or communication - priority for communication & outer part of kernel --- .../UniformGridGPU/UniformGridGPU.cpp | 24 +++++++++++++++---- .../UniformGridGPU/UniformGridGPU.prm | 7 +++++- .../UniformGridGPU/UniformGridGPU.py | 10 +++++--- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 4d8686956..e8c9299c4 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -87,7 +87,10 @@ int main( int argc, char **argv ) noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID ); //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID ); - // Communication setup + bool disableBoundaries = parameters.getParameter<bool>( "disableBoundaries", false ); + bool kernelOnly = parameters.getParameter<bool>( "kernelOnly", false ); + + // Communication setup bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true ); bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline ); @@ -98,8 +101,6 @@ int main( int argc, char **argv ) pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega ); lbKernel.setOuterPriority( streamHighPriority ); - //CommScheme_T gpuComm( blocks, cudaEnabledMPI ); - //gpuComm.addPackInfo( make_shared<pystencils::UniformGridGPU_PackInfo>( pdfFieldGpuID )); UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI ); @@ -116,6 +117,7 @@ int main( int argc, char **argv ) { for( auto &block: *blocks ) { + if(!disableBoundaries) { auto p = boundaryInnerStreams.parallelSection( innerStream ); p.run( [&block, &ubb]( cudaStream_t s ) { ubb.inner( &block, s ); } ); @@ -131,6 +133,7 @@ int main( int argc, char **argv ) for( auto &block: *blocks ) { + if(!disableBoundaries) { auto p = boundaryOuterStreams.parallelSection( outerStream ); p.run( [&block, &ubb]( cudaStream_t s ) { ubb.outer( &block, s ); } ); @@ -148,6 +151,7 @@ int main( int argc, char **argv ) gpuComm(); for( auto &block: *blocks ) { + if(!disableBoundaries) { auto p = boundaryStreams.parallelSection( defaultStream ); p.run( [&block, &ubb]( cudaStream_t s ) { ubb( &block, s ); } ); @@ -157,9 +161,21 @@ int main( int argc, char **argv ) } }; + auto kernelOnlyFunc = [&] () + { + for( auto &block: *blocks ) + lbKernel( &block ); + }; + SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) : std::function<void()>( normalTimeStep ); + if( kernelOnly ) + { + WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!") + timeStep = kernelOnlyFunc; + } + timeLoop.add() << BeforeFunction( timeStep ) << Sweep( []( IBlock * ) {}, "time step" ); @@ -185,8 +201,8 @@ int main( int argc, char **argv ) cudaDeviceSynchronize(); WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps"); simTimer.start(); - cudaDeviceSynchronize(); timeLoop.run(); + cudaDeviceSynchronize(); simTimer.end(); WALBERLA_LOG_INFO_ON_ROOT("Simulation finished"); auto time = simTimer.last(); diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm index 2b340e0ef..22877854b 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm @@ -2,13 +2,16 @@ Parameters { omega 1.8; - timesteps 1000; + timesteps 500; remainingTimeLoggerFrequency 3; vtkWriteFrequency 0; overlapCommunication true; cudaEnabledMPI false; + + kernelOnly false; + disableBoundaries false; } DomainSetup @@ -20,8 +23,10 @@ DomainSetup Boundaries { + /* Border { direction W; walldistance -1; flag NoSlip; } Border { direction E; walldistance -1; flag NoSlip; } Border { direction S; walldistance -1; flag NoSlip; } Border { direction N; walldistance -1; flag UBB; } + */ } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index a4619226e..e5b05555a 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -6,6 +6,10 @@ from lbmpy_walberla import generate_lattice_model, generate_boundary from pystencils_walberla import CodeGeneration, generate_sweep +sweep_block_size = (128, 1, 1) +sweep_params = {'block_size': sweep_block_size} + + with CodeGeneration() as ctx: # LB options options = { @@ -16,8 +20,8 @@ with CodeGeneration() as ctx: 'compressible': False, 'temporary_field_name': 'pdfs_tmp', 'optimization': {'cse_global': True, - 'cse_pdfs': True, - 'gpu_indexing_params': {'block_size': (128, 1, 1)}} + 'cse_pdfs': False, + } } lb_method = create_lb_method(**options) update_rule = create_lb_update_rule(lb_method=lb_method, **options) @@ -27,7 +31,7 @@ with CodeGeneration() as ctx: # gpu LB sweep & boundaries generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, field_swaps=[('pdfs', 'pdfs_tmp')], - inner_outer_split=True, target='gpu') + inner_outer_split=True, target='gpu', gpu_indexing_params=sweep_params) generate_boundary(ctx, 'UniformGridGPU_NoSlip', NoSlip(), lb_method, target='gpu') generate_boundary(ctx, 'UniformGridGPU_UBB', UBB([0.05, 0, 0]), lb_method, target='gpu') -- GitLab