From aeff901c3cfcbe3ecb0d8a386508a5718a155c00 Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Mon, 7 Oct 2019 13:55:40 +0200 Subject: [PATCH] UniformGridGenerated: detailed output --- .../UniformGridGenerated/CMakeLists.txt | 1 + .../UniformGridGenerated/UniformGrid.prm | 1 - .../UniformGridGenerated.cpp | 36 ++++--- .../UniformGridGenerated.py | 63 +++++++---- .../benchmarks/UniformGridGenerated/params.py | 100 ++++++++++++++++++ 5 files changed, 164 insertions(+), 37 deletions(-) create mode 100644 apps/benchmarks/UniformGridGenerated/params.py diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt index 5d3f2dca0..b3f9795f7 100644 --- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt @@ -1,4 +1,5 @@ waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) waLBerla_python_file_generates(UniformGridGenerated.py diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm index e53b3cd4f..cf2f26ad8 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm +++ b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm @@ -13,7 +13,6 @@ Parameters outerIterations 1; // how many measurements to conduct vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled - cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation timeStepMode aa; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time directComm 1; diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp index d6671b3b2..d5515c0c7 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp +++ 
b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp @@ -1,5 +1,6 @@ #include "core/Environment.h" #include "core/logging/Initialization.h" +#include "core/OpenMP.h" #include "python_coupling/CreateConfig.h" #include "python_coupling/PythonCallback.h" #include "python_coupling/DictWrapper.h" @@ -11,6 +12,7 @@ #include "timeloop/all.h" #include "core/timing/TimingPool.h" #include "core/timing/RemainingTimeLogger.h" +#include "core/waLBerlaBuildInfo.h" #include "domain_decomposition/SharedSweep.h" #include "gui/Gui.h" #include "InitShearVelocity.h" @@ -30,6 +32,7 @@ #include "GenMpiDtypeInfoAAPull.h" #include "GenMpiDtypeInfoAAPush.h" +#include <iomanip> using namespace walberla; @@ -162,24 +165,26 @@ int main( int argc, char **argv ) { timeLoop.setCurrentTimeStepToZero(); WcTimer simTimer; - WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); + + auto threads = omp_get_max_threads(); + simTimer.start(); timeLoop.run(); - /* - pystencils::GenLbKernelAAEven k1(pdfFieldId, omega); - pystencils::GenLbKernelAAOdd k2(pdfFieldId, omega); - for(int t=0; t < timesteps / 2; ++t) - { for( auto & b : *blocks) { - k1(&b); - k2(&b); - }}*/ simTimer.end(); - WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); auto time = simTimer.last(); auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; - WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); - WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); + + using std::setw; + WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode << + " procs: " << setw(6) << MPIManager::instance()->numProcesses() << + " threads: " << threads << + " direct_comm: " << directComm << + " time steps: " << timesteps << + setw(15) << " block size: " << cellsPerBlock << + " mlups/core: " << int(mlupsPerProcess/ threads) << + " mlups: " << int(mlupsPerProcess) * 
MPIManager::instance()->numProcesses()); + WALBERLA_ROOT_SECTION() { python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); @@ -188,8 +193,11 @@ int main( int argc, char **argv ) pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); - pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); - pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); + pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict ); + pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() ); + pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() ); + pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() ); + // Call Python function to report results pythonCallbackResults(); } diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py index 3958d308e..1228bf95f 100644 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py @@ -69,25 +69,33 @@ info_header = """ #include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; const char * infoStencil = "{stencil}"; const char * infoConfigName = "{configName}"; -const bool infoCseGlobal = {cse_global}; -const bool infoCsePdfs = {cse_pdfs}; +const char * optimizationDict = "{optimizationDict}"; """ - with CodeGeneration() as ctx: common_options = { 'field_name': 'pdfs', 'temporary_field_name': 'pdfs_tmp', - 'optimization': {'cse_global': False, - 'cse_pdfs': False, - 'split': False} + } + opts = { + 'two_field_cse_pdfs': False, + 'two_field_cse_global': False, + 'two_field_split': True, + 'two_field_nt_stores': True, + + 'aa_even_cse_pdfs': False, + 
'aa_even_cse_global': False, + 'aa_even_split': False, + 'aa_even_nt_stores': False, + + 'aa_odd_cse_pdfs': False, + 'aa_odd_cse_global': False, + 'aa_odd_split': True, + 'aa_odd_nt_stores': False, } config_name = ctx.config noopt = False d3q27 = False - if config_name.endswith("_noopt"): - noopt = True - config_name = config_name[:-len("_noopt")] if config_name.endswith("_d3q27"): d3q27 = True config_name = config_name[:-len("_d3q27")] @@ -104,20 +112,33 @@ with CodeGeneration() as ctx: stencil_str = options['stencil'] q = int(stencil_str[stencil_str.find('Q')+1:]) pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') - options['optimization']['symbolic_field'] = pdfs - update_rule_two_field = create_lb_update_rule(**options) - update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(), **options) - options['optimization']['split'] = True - update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(), **options) - - vec = {'nontemporal': False, 'assume_aligned': True, 'assume_inner_stride_one': True} + update_rule_two_field = create_lb_update_rule(optimization={'symbolic_field': pdfs, + 'split': opts['two_field_split'], + 'cse_global': opts['two_field_cse_global'], + 'cse_pdfs': opts['two_field_cse_pdfs']}, **options) + update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(), + optimization={'symbolic_field': pdfs, + 'split': opts['aa_even_split'], + 'cse_global': opts['aa_even_cse_global'], + 'cse_pdfs': opts['aa_even_cse_pdfs']}, **options) + update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(), + optimization={'symbolic_field': pdfs, + 'split': opts['aa_odd_split'], + 'cse_global': opts['aa_odd_cse_global'], + 'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options) + + vec = { 'assume_aligned': True, 'assume_inner_stride_one': True} # Sweeps - generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 
'pdfs_tmp')]) - generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info={'assume_aligned': True}, + vec['nontemporal'] = opts['two_field_nt_stores'] + generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')], + cpu_vectorize_info=vec) + vec['nontemporal'] = opts['aa_even_nt_stores'] + generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec, cpu_openmp=True, ghost_layers=1) - generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info={'assume_aligned': True}, + vec['nontemporal'] = opts['aa_odd_nt_stores'] + generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec, cpu_openmp=True, ghost_layers=1) setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector, @@ -144,8 +165,6 @@ with CodeGeneration() as ctx: 'stencil': stencil_str, 'q': q, 'configName': ctx.config, - 'cse_global': int(options['optimization']['cse_global']), - 'cse_pdfs': int(options['optimization']['cse_pdfs']), + 'optimizationDict': str(opts), } ctx.write_file("GenDefines.h", info_header.format(**infoHeaderParams)) - diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py new file mode 100644 index 000000000..399d29fd0 --- /dev/null +++ b/apps/benchmarks/UniformGridGenerated/params.py @@ -0,0 +1,100 @@ +import math +import os +import operator +import waLBerla as wlb +from waLBerla.tools.sqlitedb import * +from functools import reduce + + +def prod(seq): + return reduce(operator.mul, seq, 1) + + +def get_block_decomposition(block_decomposition, num_processes): + bx = by = bz = 1 + blocks_per_axis = int(math.log(num_processes, 2)) + for i in range(blocks_per_axis): + decomposition_axis = block_decomposition[i % len(block_decomposition)] + if decomposition_axis == 'y': + by *= 2 + elif decomposition_axis == 'z': + bz *= 2 + elif decomposition_axis == 'x': + bx *= 
2 + + assert (bx * by * bz) == num_processes + return bx, by, bz + + +def calculate_time_steps(runtime, expected_mlups, domain_size): + cells = prod(domain_size) + time_steps_per_second = expected_mlups * 1e6 / cells + return int(time_steps_per_second * runtime) + + +class BenchmarkScenario: + def __init__(self, block_size=(256, 128, 128), direct_comm=True, time_step_mode='aa', db_file_name='uniform_grid_gen.sqlite'): + self.block_size = block_size + self.direct_comm = direct_comm + self.time_step_mode = time_step_mode + self.threads = int(os.environ['OMP_NUM_THREADS']) + self.processes = wlb.mpi.numProcesses() + self.db_file_name = db_file_name + + @wlb.member_callback + def config(self, **kwargs): + time_steps_for_128_cubed = 50 + time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed) + time_steps = max(10, time_steps) + cfg = { + 'DomainSetup': { + 'blocks': (1, 1, self.processes), + 'cellsPerBlock': (self.block_size[0], self.block_size[1], self.block_size[2] * self.threads), + 'periodic': (1, 1, 1), + }, + 'Parameters': { + 'timesteps': time_steps, + 'warmupSteps': 6, + 'outerIterations': 3, + 'vtkWriteFrequency': 0, + 'remainingTimeLoggerFrequency': 0, + 'omega': 1.6, + 'timeStepMode': self.time_step_mode, + 'directComm': self.direct_comm, + } + } + return cfg + + @wlb.member_callback + def results_callback(self, mlupsPerProcess, optimizations, **kwargs): + cfg = self.config() + result = { + 'block_size': self.block_size, + 'mlups_per_core': mlupsPerProcess / self.threads, + 'threads': self.threads, + 'processes': self.processes, + 'time_step_mode': self.time_step_mode, + 'direct_comm': self.direct_comm, + 'time_steps': cfg['Parameters']['timesteps'], + 'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''), + 'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_DOMAIN', ''), + } + + optimizations = eval(optimizations) + result.update(optimizations) + result.update(kwargs) + sequenceValuesToScalars(result) + checkAndUpdateSchema(result, "runs", 
self.db_file_name) + storeSingle(result, "runs", self.db_file_name) + + +def benchmark(): + scenarios = wlb.ScenarioManager() + for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 64, 64), (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]: + for direct_comm in (True, False): + for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']: + sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode) + scenarios.add(sc) + +benchmark() + -- GitLab