diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
index 6e704b430899c6e2545e786b93a4606e1c110021..857df4d5f20c4be25a6dacaafd8e3c987a53531f 100644
--- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt
+++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt
@@ -29,10 +29,12 @@ waLBerla_python_file_generates(UniformGridGPU_AA.py
                                UniformGridGPU_AA_PackInfoPush.cu UniformGridGPU_AA_PackInfoPush.h
                                UniformGridGPU_AA_MacroSetter.cpp UniformGridGPU_AA_MacroSetter.h
                                UniformGridGPU_AA_MacroGetter.cpp UniformGridGPU_AA_MacroGetter.h
+                               UniformGridGPU_AA_Defines.h
                                )
 
-set(config "srt")
-waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
-      FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
-      DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
-      CODEGEN_CFG ${config})
+foreach(config srt trt mrt smagorinsky entropic )
+    waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_AA_${config}
+          FILES UniformGridGPU_AA.cpp UniformGridGPU_AA.py
+          DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui
+          CODEGEN_CFG ${config})
+endforeach()
diff --git a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
index fe038ebdb41d7102cca994d742b3139e08eb30f7..2aed66b1ade0380aed7ff59080367897937dae12 100644
--- a/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
+++ b/apps/benchmarks/UniformGridGPU/InitShearVelocity.h
@@ -6,7 +6,7 @@ namespace walberla {
 
 
 inline void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID velFieldID,
-                              const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
+                              const real_t xMagnitude=0.005, const real_t fluctuationMagnitude=0.05 )
 {
    math::seedRandomGenerator(0);
    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index ff2c93c4db3dbcca34e2aaeeb75994c7c848abdb..a413df6934496259c62fb21b3295a5e9d64186e8 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -151,6 +151,8 @@ int main( int argc, char **argv )
       communicationScheme = UniformGPUScheme_Memcpy;
    else if (communicationSchemeStr == "MPIDatatypes")
       communicationScheme = MPIDatatypes;
+   else if (communicationSchemeStr == "MPIDatatypesFull")
+      communicationScheme = MPIDatatypesFull;
    else {
      WALBERLA_ABORT_NO_DEBUG_INFO("Invalid choice for communicationScheme")
    }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
index de2821b8277003c09714ea8ad8f8d161f42c69ec..3ef98e079053e48b38a698729785f7521dffb094 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.prm
@@ -1,27 +1,28 @@
 DomainSetup
 {
    blocks        <  1,   1,   1 >;
-   cellsPerBlock <  512, 256, 256 >;
+   cellsPerBlock <  128, 128, 128 >;
    periodic      <  1,   1,   1 >;
 }
 
 
 Parameters
 {
 
-   timesteps       2000; //10000;   // time steps of one performance measurement
-   warmupSteps     0;               // number of steps to run before measurement starts
-   outerIterations 1;               // how many measurements to conduct
+   timesteps       1000;            // time steps of one performance measurement
+   warmupSteps     100;             // number of steps to run before measurement starts
+   outerIterations 3;               // how many measurements to conduct
 
   // Can be one of: GPUPackInfo_Baseline, GPUPackInfo_Streams, UniformGPUScheme_Baseline, UniformGPUScheme_Memcpy
   communicationScheme UniformGPUScheme_Baseline;
 
-   vtkWriteFrequency 0; //100;         // write a VTK file every n'th step, if zero VTK output is disabled
+   vtkWriteFrequency 0;                // write a VTK file every n'th step, if zero VTK output is disabled
    cudaEnabledMPI false;               // switch on if you have a CUDA-enabled MPI implementation
-   timeStepStrategy noOverlap;         // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
+   timeStepStrategy kernelOnly;        // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
    innerOuterSplit < 8, 1, 1>;         // slice-thickness that 'outer'-kernels process when overlapping
    remainingTimeLoggerFrequency 5;     // interval in seconds to log the estimated remaining time
+   gpuBlockSize < 128, 1, 1>;
 
    omega 1.92;
    initShearFlow 1;
 
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
index 80a4a3490b825d855d1ba0a30b0e3964e76a1c0f..b95b514b5ed4b29aeacca7f71b38440f5f7d8e0e 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py
@@ -12,10 +12,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 
 
 omega = sp.symbols("omega")
-# sweep_block_size = (128, 1, 1)
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
 
 sweep_params = {'block_size': sweep_block_size}
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
index 348f63fd0dd5d08fdb481cecde9ab25222e17bc4..be38e3cb3eb6fcaadaa3caca922776b7b2b6f480 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp
@@ -21,9 +21,13 @@
 #include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/DeviceSelectMPI.h"
 #include "domain_decomposition/SharedSweep.h"
-#include "stencil/D3Q19.h"
-#include "stencil/D3Q27.h"
 #include "InitShearVelocity.h"
+#include "gui/Gui.h"
+
+#ifdef WALBERLA_ENABLE_GUI
+#include "lbm/gui/PdfFieldDisplayAdaptor.h"
+#endif
+
 
 #include "UniformGridGPU_AA_PackInfoPush.h"
 #include "UniformGridGPU_AA_PackInfoPull.h"
@@ -31,11 +35,12 @@
 #include "UniformGridGPU_AA_MacroGetter.h"
 #include "UniformGridGPU_AA_LbKernelEven.h"
 #include "UniformGridGPU_AA_LbKernelOdd.h"
+#include "UniformGridGPU_AA_Defines.h"
+#include <cmath>
 
 
 using namespace walberla;
 
-using Stencil_T = stencil::D3Q19;   //TODO make generic - and determine from python script
 using CommunicationStencil_T = Stencil_T;
 using PdfField_T = GhostLayerField< real_t, Stencil_T::Q >;
 using VelocityField_T = GhostLayerField< real_t, 3 >;
@@ -50,6 +55,8 @@ int main( int argc, char **argv )
    {
       WALBERLA_MPI_WORLD_BARRIER();
 
+      WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+
      auto config = *cfg;
      logging::configureLogging( config );
      auto blocks = blockforest::createUniformBlockGridFromConfig( config );
@@ -61,7 +68,7 @@ int main( int argc, char **argv )
      const uint_t timesteps = parameters.getParameter< uint_t >( "timesteps", uint_c( 50 ));
 
      // Creating fields
-      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( 42.0 ), field::fzyx );
+      BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t( std::nan("") ), field::fzyx );
      BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx );
 
      WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" );
@@ -215,32 +222,52 @@ int main( int argc, char **argv )
         timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
      }
 
-      for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
+      bool useGui = parameters.getParameter<bool>( "useGui", false );
+      if( useGui )
+      {
+         cuda::fieldCpy< PdfField_T, cuda::GPUField< real_t > >( blocks, pdfFieldCpuID, pdfFieldGpuID );
+         timeLoop.addFuncAfterTimeStep( cuda::fieldCpyFunctor<PdfField_T, cuda::GPUField<real_t> >( blocks, pdfFieldCpuID, pdfFieldGpuID ), "copy to CPU" );
+         GUI gui( timeLoop, blocks, argc, argv);
+         gui.registerDisplayAdaptorCreator(
+                 [&](const IBlock & block, ConstBlockDataID blockDataID) -> gui::DisplayAdaptor * {
+                     if ( block.isDataOfType< PdfField_T >( blockDataID) )
+                         return new lbm::PdfFieldDisplayAdaptor<GhostLayerField<real_t, Stencil_T::Q>, Stencil_T >( blockDataID );
+                     return nullptr;
+                 });
+         gui.run();
+      }
+      else
      {
-         timeLoop.setCurrentTimeStepToZero();
-         WcTimer simTimer;
-         cudaDeviceSynchronize();
-         WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
-         simTimer.start();
-         timeLoop.run();
-         cudaDeviceSynchronize();
-         simTimer.end();
-         WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
-         auto time = simTimer.last();
-         auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
-
-         auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
-         WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
-         WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
-         WALBERLA_ROOT_SECTION()
+         for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
         {
-            python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
-            if ( pythonCallbackResults.isCallable())
+            WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+
+            timeLoop.setCurrentTimeStepToZero();
+            WcTimer simTimer;
+            cudaDeviceSynchronize();
+            WALBERLA_CUDA_CHECK( cudaPeekAtLastError() );
+            WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
+            simTimer.start();
+            timeLoop.run();
+            cudaDeviceSynchronize();
+            simTimer.end();
+            WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
+            auto time = simTimer.last();
+            auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
+
+            auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
+            WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
+            WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
+            WALBERLA_ROOT_SECTION()
            {
-               pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
-               pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
-               // Call Python function to report results
-               pythonCallbackResults();
+               python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
+               if ( pythonCallbackResults.isCallable())
+               {
+                  pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
+                  pythonCallbackResults.data().exposeValue( "githash", WALBERLA_GIT_SHA1 );
+                  // Call Python function to report results
+                  pythonCallbackResults();
+               }
            }
         }
      }
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
index 45ea8043d0cd1fe2a78080e70a93e9874844d40f..105cb48f4800cbc82ba16908025eb9dd120d8777 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py
@@ -10,10 +10,14 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisions
 from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter
 
 
 omega = sp.symbols("omega")
-# sweep_block_size = (128, 1, 1)
-sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
-                    TypedSymbol("cudaBlockSize1", np.int32),
-                    1)
+compile_time_block_size = False
+
+if compile_time_block_size:
+    sweep_block_size = (128, 1, 1)
+else:
+    sweep_block_size = (TypedSymbol("cudaBlockSize0", np.int32),
+                        TypedSymbol("cudaBlockSize1", np.int32),
+                        1)
 
 sweep_params = {'block_size': sweep_block_size}
@@ -58,7 +62,9 @@ with CodeGeneration() as ctx:
     common_options = {
         'field_name': 'pdfs',
         'optimization': {'cse_global': True,
-                         'cse_pdfs': False}
+                         'cse_pdfs': False,
+                         'field_layout': 'fzyx',
+                         }
     }
     options = options_dict.get(ctx.config, options_dict['srt'])
     options.update(common_options)
@@ -98,3 +104,6 @@ with CodeGeneration() as ctx:
     # communication
     generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPull', update_rules['Odd'], kind='pull', target='gpu')
     generate_pack_info_from_kernel(ctx, 'UniformGridGPU_AA_PackInfoPush', update_rules['Odd'], kind='push', target='gpu')
+
+    ctx.write_file("UniformGridGPU_AA_Defines.h",
+                   '#include "stencil/D3Q{0}.h"\nusing Stencil_T = walberla::stencil::D3Q{0}; \n '.format(q))
diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
index 6a3bf6b532fe67b753cf2d64737a25d7747e067d..db0ec86e6cc1f58c4548a57023ac8ce49d69478f 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_Communication.h
@@ -6,6 +6,7 @@
 #include "blockforest/communication/UniformBufferedScheme.h"
 #include "blockforest/communication/UniformDirectScheme.h"
 #include "field/communication/StencilRestrictedMPIDatatypeInfo.h"
+#include "field/communication/UniformMPIDatatypeInfo.h"
 #include "cuda/communication/GPUPackInfo.h"
 #include "cuda/communication/UniformGPUScheme.h"
 #include "cuda/communication/MemcpyPackInfo.h"
@@ -20,7 +21,8 @@ enum CommunicationSchemeType {
    GPUPackInfo_Streams = 1,
    UniformGPUScheme_Baseline = 2,
    UniformGPUScheme_Memcpy = 3,
-   MPIDatatypes = 4
+   MPIDatatypes = 4,
+   MPIDatatypesFull = 5
 };
 
 
@@ -36,6 +38,7 @@ public:
      auto generatedPackInfo = make_shared<pystencils::UniformGridGPU_PackInfo>( bdId );
      auto memcpyPackInfo = make_shared< cuda::communication::MemcpyPackInfo< GPUFieldType > >( bdId );
      auto dataTypeInfo = make_shared< field::communication::StencilRestrictedMPIDatatypeInfo< GPUFieldType, StencilType > >( bdId );
+      auto dataTypeInfoFull = make_shared< field::communication::UniformMPIDatatypeInfo<GPUFieldType> >( bdId );
 
      switch(_commSchemeType)
      {
@@ -63,6 +66,12 @@ public:
            }
            _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfo );
            break;
+         case MPIDatatypesFull:
+            if( ! cudaEnabledMPI ) {
+               WALBERLA_ABORT("MPI datatype-based communication not possible if no cudaEnabledMPI is available.");
+            }
+            _directScheme = make_shared< blockforest::communication::UniformDirectScheme< StencilType > >( bf, dataTypeInfoFull );
+            break;
         default:
            WALBERLA_ABORT("Invalid GPU communication scheme specified!");
      }
@@ -103,6 +112,7 @@ public:
           _gpuCommunicationScheme->startCommunication( communicationStream );
           break;
        case MPIDatatypes:
+       case MPIDatatypesFull:
          WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
          _directScheme->startCommunication();
          break;
@@ -131,6 +141,7 @@ public:
          _gpuCommunicationScheme->wait( communicationStream );
          break;
        case MPIDatatypes:
+       case MPIDatatypesFull:
          WALBERLA_ASSERT_NOT_NULLPTR( _directScheme );
          _directScheme->wait();
          break;
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
index 420be74fed17999af756a54aa7330bd9217580cf..1beacb0059ba1df29e03cc71387ee645cf730268 100755
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py
@@ -9,6 +9,7 @@ from os import getcwd
 from waLBerla.tools.jobscripts import createJobscript
 from datetime import timedelta
 from copy import deepcopy
+import sys
 
 CSV_FILE = "overlap_benchmark.csv"
 
@@ -22,16 +23,18 @@ BASE_CONFIG = {
         'timesteps': 400,
         'cudaEnabledMPI': False,
         'warmupSteps': 5,
-        'outerIterations': 1,
+        'outerIterations': 3,
+        'initShearFlow': True,
     }
 }
 
 
 class Scenario:
-    def __init__(self, **kwargs):
+    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
         self.config_dict = deepcopy(BASE_CONFIG)
         self.config_dict['Parameters'].update(kwargs)
         self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
+        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block
 
     @wlb.member_callback
     def config(self, **kwargs):
@@ -45,6 +48,10 @@ class Scenario:
         data.update(self.config_dict['Parameters'])
         data.update(self.config_dict['DomainSetup'])
         data.update(kwargs)
+        data['executable'] = sys.argv[0]
+        data['compile_flags'] = wlb.build_info.compiler_flags
+        data['walberla_version'] = wlb.build_info.version
+        data['build_machine'] = wlb.build_info.build_machine
         sequenceValuesToScalars(data)
 
         df = pd.DataFrame.from_records([data])
@@ -73,15 +80,50 @@ def overlap_benchmark():
             scenarios.add(scenario)
 
 
-def generate_jobscripts(machine='pizdaint_hybrid'):
+def single_gpu_benchmark():
+    scenarios = wlb.ScenarioManager()
+    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
+    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
+                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
+                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
+                   (32, 8, 1), (64, 8, 1),
+                   (32, 16, 1)]
+    for block_size in block_sizes:
+        for cuda_block_size in cuda_blocks:
+            cells = block_size[0] * block_size[1] * block_size[2]
+            time_steps_for_128_cubed = 1000
+            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
+            scenario = Scenario(cells_per_block=block_size,
+                                gpuBlockSize=cuda_block_size,
+                                timeStepStrategy='kernelOnly',
+                                timesteps=int(time_steps))
+            scenarios.add(scenario)
+
+
+all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
+                   'UniformGridBenchmarkGPU_AA_mrt',
+                   'UniformGridBenchmarkGPU_AA_smagorinsky',
+                   'UniformGridBenchmarkGPU_AA_srt',
+                   'UniformGridBenchmarkGPU_AA_trt',
+                   'UniformGridBenchmarkGPU_entropic',
+                   'UniformGridBenchmarkGPU_mrt',
+                   'UniformGridBenchmarkGPU_smagorinsky',
+                   'UniformGridBenchmarkGPU_srt',
+                   'UniformGridBenchmarkGPU_trt')
+
+
+def generate_jobscripts(machine='pizdaint_hybrid',
+                        exe_names=all_executables):
     for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
         with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
+
             js = createJobscript(nodes=node_count,
                                  output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
                                  error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
                                  initial_dir=getcwd(),
-                                 exe_name='UniformGridBenchmarkGPU',
-                                 parameter_files=['overlap_benchmark.py'],
+                                 #exe_name='UniformGridBenchmarkGPU',
+                                 #parameter_files=['overlap_benchmark.py'],
+                                 commands=list((exe, 'overlap_benchmark.py') for exe in exe_names),
                                  wall_time=timedelta(minutes=25),
                                  machine=machine,
                                  account='d105',
@@ -93,4 +135,4 @@ if __name__ == '__main__':
     print("Called without waLBerla - generating job scripts for PizDaint")
     generate_jobscripts()
 else:
-    overlap_benchmark()
+    single_gpu_benchmark()
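
Note on the constant-work scaling used in single_gpu_benchmark() above: a minimal standalone sketch, not part of the patch itself; the helper name scaled_time_steps is hypothetical, and only the 128^3 baseline of 1000 time steps is taken from the hunk. Larger blocks run proportionally fewer time steps, so every scenario performs roughly the same number of cell updates.

    # Hypothetical helper illustrating the scaling in single_gpu_benchmark():
    # keep cells * time_steps roughly constant across block sizes.
    def scaled_time_steps(block_size, baseline_cells=128 ** 3, baseline_steps=1000):
        cells = block_size[0] * block_size[1] * block_size[2]
        return int((baseline_cells / cells) * baseline_steps)

    assert scaled_time_steps((128, 128, 128)) == 1000
    assert scaled_time_steps((256, 256, 256)) == 125   # 8x the cells -> 1/8 the steps
    assert scaled_time_steps((64, 64, 64)) == 8000     # 1/8 the cells -> 8x the steps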