From e33c19beb328d0d73cffdef243086dae93130899 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Tue, 25 Jun 2019 09:27:45 +0200 Subject: [PATCH] UniformGridGPU more configurations --- apps/benchmarks/UniformGridGPU/CMakeLists.txt | 2 +- .../UniformGridGPU/UniformGridGPU.cpp | 37 ++++++++++------- .../UniformGridGPU/UniformGridGPU.py | 40 +++++++++++++++++-- 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 66497288f..ca6aa7b23 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -13,7 +13,7 @@ waLBerla_python_file_generates(UniformGridGPU.py UniformGridGPU_Defines.h ) -foreach(config srt trt mrt smagorinsky entropic ) +foreach(config srt trt mrt smagorinsky entropic smagorinsky_noopt entropic_kbc_n4 entropic_kbc_n4_noopt mrt_noopt mrt_full mrt_full_noopt) waLBerla_add_executable ( NAME UniformGridBenchmarkGPU_${config} FILES UniformGridGPU.cpp UniformGridGPU.py DEPENDS blockforest boundary core cuda domain_decomposition field geometry timeloop vtk gui diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index 0af7c1360..c1ed30eaf 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -95,26 +95,32 @@ int main( int argc, char **argv ) Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); // Reading parameters auto parameters = config->getOneBlock( "Parameters" ); + const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false); // Creating fields - BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(99.8), field::fzyx); + BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); BlockDataID velFieldCpuID = field::addToStorage< VelocityField_T >( blocks, "vel", real_t(0), field::fzyx); - if( initShearFlow ) { - WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow"); - initShearVelocity( blocks, velFieldCpuID ); - } - pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID); - for( auto & block : *blocks ) - setterSweep( &block ); - // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here - blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks); - initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) ); - initialComm(); + if( timeStepStrategy != "kernelOnlyNoInit") + { + if ( initShearFlow ) + { + WALBERLA_LOG_INFO_ON_ROOT( "Initializing shear flow" ); + initShearVelocity( blocks, velFieldCpuID ); + } + + pystencils::UniformGridGPU_MacroSetter setterSweep(pdfFieldCpuID, velFieldCpuID); + for( auto & block : *blocks ) + setterSweep( &block ); + // setter sweep only initializes interior of domain - for push schemes to work a first communication is required here + blockforest::communication::UniformBufferedScheme<CommunicationStencil_T> initialComm(blocks); + initialComm.addPackInfo( make_shared< field::communication::PackInfo<PdfField_T> >( pdfFieldCpuID ) ); + initialComm(); + } BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage<PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >( blocks, "flag field" ); @@ -165,7 +171,9 @@ int main( int argc, char **argv ) int streamLowPriority = 0; WALBERLA_CUDA_CHECK( cudaDeviceGetStreamPriorityRange(&streamLowPriority, &streamHighPriority) ); WALBERLA_CHECK(gpuBlockSize[2] == 1); - pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, gpuBlockSize[0], gpuBlockSize[1], + pystencils::UniformGridGPU_LbKernel lbKernel( pdfFieldGpuID, omega, + 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, + gpuBlockSize[0], gpuBlockSize[1], Cell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]) ); lbKernel.setOuterPriority( streamHighPriority ); UniformGridGPU_Communication< CommunicationStencil_T, cuda::GPUField< double > > @@ -252,7 +260,6 @@ int main( int argc, char **argv ) SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); - const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); std::function<void()> timeStep; if (timeStepStrategy == "noOverlap") timeStep = std::function<void()>( normalTimeStep ); @@ -260,7 +267,7 @@ int main( int argc, char **argv ) timeStep = std::function<void()>( overlapTimeStep ); else if (timeStepStrategy == "simpleOverlap") timeStep = simpleOverlapTimeStep; - else if (timeStepStrategy == "kernelOnly") { + else if (timeStepStrategy == "kernelOnly" or timeStepStrategy == "kernelOnlyNoInit") { WALBERLA_LOG_INFO_ON_ROOT("Running only compute kernel without boundary - this makes only sense for benchmarking!") timeStep = kernelOnlyFunc; } diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index d3b9444f7..130337cd3 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -12,6 +12,7 @@ from pystencils.fast_approximation import insert_fast_sqrts, insert_fast_divisio from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter omega = sp.symbols("omega") +omega_fill = sp.symbols("omega_:10") compile_time_block_size = False if compile_time_block_size: @@ -40,6 +41,11 @@ options_dict = { 'stencil': 'D3Q19', 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1], }, + 'mrt_full': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], omega_fill[3], omega_fill[4], omega_fill[5]], + }, 'entropic': { 'method': 'mrt3', 'stencil': 'D3Q19', @@ -47,6 +53,13 @@ options_dict = { 'relaxation_rates': [omega, omega, sp.Symbol("omega_free")], 'entropic': True, }, + 'entropic_kbc_n4': { + 'method': 'trt-kbc-n4', + 'stencil': 'D3Q27', + 'compressible': True, + 'relaxation_rates': [omega, sp.Symbol("omega_free")], + 'entropic': True, + }, 'smagorinsky': { 'method': 'srt', 'stencil': 'D3Q19', @@ -76,8 +89,19 @@ with CodeGeneration() as ctx: 'optimization': {'cse_global': True, 'cse_pdfs': False} } - options = options_dict.get(ctx.config, options_dict['srt']) + config_name = ctx.config + noopt = False + if config_name.endswith("_noopt"): + noopt = True + config_name = config_name[:-len("_noopt")] + + options = options_dict[config_name] options.update(common_options) + options = options.copy() + + if noopt: + options['optimization']['cse_global'] = False + options['optimization']['cse_pdfs'] = False stencil_str = options['stencil'] q = int(stencil_str[stencil_str.find('Q')+1:]) @@ -85,14 +109,22 @@ with CodeGeneration() as ctx: options['optimization']['symbolic_field'] = pdfs vp = [ + ('double', 'omega_0'), + ('double', 'omega_1'), + ('double', 'omega_2'), + ('double', 'omega_3'), + ('double', 'omega_4'), + ('double', 'omega_5'), + ('double', 'omega_6'), ('int32_t', 'cudaBlockSize0'), - ('int32_t', 'cudaBlockSize1') + ('int32_t', 'cudaBlockSize1'), ] lb_method = create_lb_method(**options) update_rule = create_lb_update_rule(lb_method=lb_method, **options) - update_rule = insert_fast_divisions(update_rule) - update_rule = insert_fast_sqrts(update_rule) + if not noopt: + update_rule = insert_fast_divisions(update_rule) + update_rule = insert_fast_sqrts(update_rule) # CPU lattice model - required for macroscopic value computation, VTK output etc. options_without_opt = options.copy() -- GitLab