diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index db5e25972442ba7fb2384d71b81dc42fe661aa8f..58a4d0356579a8fcfe89078bc947d44fdfb3fe43 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -76,9 +76,14 @@ int main( int argc, char **argv )
 
       // Boundaries
       const FlagUID fluidFlagUID( "Fluid" );
-      auto boundariesConfig = config->getOneBlock( "Boundaries" );
-      geometry::initBoundaryHandling<FlagField_T>(*blocks, flagFieldID, boundariesConfig);
-      geometry::setNonBoundaryCellsToDomain<FlagField_T>(*blocks, flagFieldID, fluidFlagUID);
+      auto boundariesConfig = config->getBlock( "Boundaries" );
+      bool disableBoundaries = true;
+      if( boundariesConfig )
+      {
+         disableBoundaries = false;
+         geometry::initBoundaryHandling< FlagField_T >( *blocks, flagFieldID, boundariesConfig );
+         geometry::setNonBoundaryCellsToDomain< FlagField_T >( *blocks, flagFieldID, fluidFlagUID );
+      }
 
       lbm::UniformGridGPU_UBB ubb(blocks, pdfFieldGpuID);
       lbm::UniformGridGPU_NoSlip noSlip(blocks, pdfFieldGpuID);
@@ -88,19 +93,17 @@ int main( int argc, char **argv )
      noSlip.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID );
      //pressure.fillFromFlagField<FlagField_T>( blocks, flagFieldID, FlagUID("pressure"), fluidFlagUID );
 
-      bool disableBoundaries = parameters.getParameter<bool>( "disableBoundaries", false );
-      bool kernelOnly = parameters.getParameter<bool>( "kernelOnly", false );
-
       // Communication setup
-      bool overlapCommunication = parameters.getParameter<bool>( "overlapCommunication", true );
       bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
       int communicationScheme = parameters.getParameter<int>( "communicationScheme", (int) CommunicationSchemeType::UniformGPUScheme_Baseline );
+      Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
+
       int streamHighPriority = 0;
       int streamLowPriority = 0;
       WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) );
 
-      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega );
+      pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) );
       lbKernel.setOuterPriority( streamHighPriority );
 
       UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > gpuComm( blocks, pdfFieldGpuID, (CommunicationSchemeType) communicationScheme, cudaEnabledMPI );
 
@@ -110,8 +113,21 @@ int main( int argc, char **argv )
       auto boundaryOuterStreams = cuda::ParallelStreams( streamHighPriority );
       auto boundaryInnerStreams = cuda::ParallelStreams( streamHighPriority );
 
+      uint_t currentTimeStep = 0;
+
+      auto simpleOverlapTimeStep = [&] ()
+      {
+         gpuComm.startCommunication(defaultStream);
+         for( auto &block: *blocks )
+            lbKernel.inner( &block, defaultStream );
+         gpuComm.wait(defaultStream);
+         for( auto &block: *blocks )
+            lbKernel.outer( &block, defaultStream );
+      };
+
       auto overlapTimeStep = [&]()
       {
+         cuda::NvtxRange namedRange("timestep");
          auto innerOuterSection = innerOuterStreams.parallelSection( defaultStream );
 
         innerOuterSection.run([&]( auto innerStream )
@@ -131,7 +147,7 @@ int main( int argc, char **argv )
 
        innerOuterSection.run([&]( auto outerStream )
        {
-           cuda::nameStream(outerStream, "inner stream");
+           cuda::nameStream(outerStream, "outer stream");
            gpuComm( outerStream );
 
            for( auto &block: *blocks )
@@ -145,6 +161,7 @@ int main( int argc, char **argv )
                lbKernel.outer( &block, outerStream );
            }
        });
+       currentTimeStep += 1;
    };
 
 
@@ -171,13 +188,22 @@ int main( int argc, char **argv )
    };
 
    SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
-   std::function<void()> timeStep = overlapCommunication ? std::function<void()>( overlapTimeStep ) :
-                                    std::function<void()>( normalTimeStep );
-   if( kernelOnly )
-   {
+
+   const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
+   std::function<void()> timeStep;
+   if (timeStepStrategy == "noOverlap")
+      timeStep = std::function<void()>( normalTimeStep );
+   else if (timeStepStrategy == "complexOverlap")
+      timeStep = std::function<void()>( overlapTimeStep );
+   else if (timeStepStrategy == "simpleOverlap")
+      timeStep = simpleOverlapTimeStep;
+   else if (timeStepStrategy == "kernelOnly") {
      WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!")
      timeStep = kernelOnlyFunc;
    }
+   else {
+      WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', 'complexOverlap', 'simpleOverlap', 'kernelOnly'");
+   }
 
    timeLoop.add() << BeforeFunction( timeStep )
                   << Sweep( []( IBlock * ) {}, "time step" );
@@ -196,33 +222,46 @@ int main( int argc, char **argv )
      timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
    }
 
-   auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
-   timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );
-
-
-   WcTimer simTimer;
-   cudaDeviceSynchronize();
-   WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
-   simTimer.start();
-   timeLoop.run();
-   cudaDeviceSynchronize();
-   simTimer.end();
-   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");
-   auto time = simTimer.last();
-   auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-   auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-   WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess);
-   WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c( timesteps ) );
-   WALBERLA_ROOT_SECTION()
-   {
-      python_coupling::PythonCallback pythonCallbackResults ( "results_callback" );
-      if ( pythonCallbackResults.isCallable() )
-      {
-         pythonCallbackResults.data().exposeValue( "mlups_per_process", mlupsPerProcess );
-         // Call Python function to report results
-         pythonCallbackResults();
-      }
+
+   int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
+   int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
+   for(int i=0; i < warmupSteps; ++i )
+      timeLoop.singleStep();
+
+   auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
+   if (remainingTimeLoggerFrequency > 0) {
+      auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
+      timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
+   }
+
+   for(int outerIteration = 0; outerIteration < outerIterations; ++outerIteration)
+   {
+      timeLoop.setCurrentTimeStepToZero();
+      WcTimer simTimer;
+      cudaDeviceSynchronize();
+      WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+      simTimer.start();
+      timeLoop.run();
+      cudaDeviceSynchronize();
+      simTimer.end();
+      WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+      auto time = simTimer.last();
+      auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+      auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+      WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+      WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+      WALBERLA_ROOT_SECTION()
+      {
+         python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+         if ( pythonCallbackResults.isCallable())
+         {
+            pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+            pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
+            // Call Python function to report results
+            pythonCallbackResults();
+         }
+      }
    }
 }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index 22877854be5e21834fe8005bc6e1636ef2cce15e..c6c1e18b2e4ce242241912d2cfb46291a3cfab12 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -1,32 +1,32 @@
+DomainSetup
+{
+   blocks        < 1, 1, 1 >;
+   cellsPerBlock < 256, 256, 128 >;
+   periodic      < 1, 1, 1 >;
+}
 Parameters
 {
    omega 1.8;
-   timesteps 500;
+   timesteps 10000;
+   warmupSteps 0;
+   outerIterations 1;
 
-   remainingTimeLoggerFrequency 3;
-   vtkWriteFrequency 0;
+   remainingTimeLoggerFrequency 30;
+   vtkWriteFrequency 500;
 
-   overlapCommunication true;
    cudaEnabledMPI false;
-   kernelOnly false;
-   disableBoundaries false;
-}
-
-DomainSetup
-{
-   blocks        < 1, 1, 1 >;
-   cellsPerBlock < 300, 300, 150 >;
-   periodic      < 0, 0, 1 >;
+   timeStepStrategy noOverlap;
+   innerOuterSplit < 64, 1, 1>;
 }
 
-Boundaries
+/*
+Boundaries
 {
-   /*
    Border { direction W;   walldistance -1;  flag NoSlip; }
    Border { direction E;   walldistance -1;  flag NoSlip; }
    Border { direction S;   walldistance -1;  flag NoSlip; }
    Border { direction N;   walldistance -1;  flag UBB; }
-   */
 }
+*/
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..65ad0b5bf8e7d723673bcb356963a544f442c8c9
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import os
+import pandas as pd
+import waLBerla as wlb
+from waLBerla.tools.config import block_decomposition
+from waLBerla.tools.sqlitedb import sequenceValuesToScalars
+from os import getcwd
+from waLBerla.tools.jobscripts import createJobscript
+from datetime import timedelta
+
+
+CSV_FILE = "overlap_benchmark.csv"
+
+BASE_CONFIG = {
+    'DomainSetup': {
+        'cellsPerBlock': (256, 256, 256),
+        'periodic': (1, 1, 1),
+    },
+    'Parameters': {
+        'omega': 1.8,
+        'timesteps': 400,
+        'cudaEnabledMPI': False,
+        'warmupSteps': 5,
+        'outerIterations': 1,
+    }
+}
+
+
+class Scenario:
+    def __init__(self, **kwargs):
+        self.config_dict = BASE_CONFIG.copy()
+        self.config_dict['Parameters'].update(kwargs)
+        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
+        return self.config_dict
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = {}
+        data.update(self.config_dict['Parameters'])
+        data.update(self.config_dict['DomainSetup'])
+        data.update(kwargs)
+        sequenceValuesToScalars(data)
+
+        df = pd.DataFrame.from_records([data])
+        if not os.path.isfile(CSV_FILE):
+            df.to_csv(CSV_FILE, index=False)
+        else:
+            df.to_csv(CSV_FILE, index=False, mode='a', header=False)
+
+
+def overlap_benchmark():
+    scenarios = wlb.ScenarioManager()
+    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), (64, 1, 1),
+                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), (64, 64, 1),
+                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32), (64, 64, 64)]
+
+    for strategy in ['simpleOverlap', 'complexOverlap', 'noOverlap']:
+        for inner_outer_split in inner_outer_splits:
+            if strategy == 'noOverlap' and inner_outer_split != (1, 1, 1):
+                continue
+            scenario = Scenario(timeStepStrategy=strategy, innerOuterSplit=inner_outer_split)
+            scenarios.add(scenario)
+
+
+if __name__ == '__main__':
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+        with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
+            js = createJobscript(nodes=node_count,
+                                 output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
+                                 error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
+                                 initial_dir=getcwd(),
+                                 exe_name='UniformGridBenchmarkGPU',
+                                 parameter_files=['overlap_benchmark.py'],
+                                 wall_time=timedelta(minutes=25),
+                                 machine='pizdaint_hybrid',
+                                 account='d105',
+                                 )
+            f.write(js)
+else:
+    overlap_benchmark()