From a1f716d0e757ac8e33abeb7868e98757b33a0b2a Mon Sep 17 00:00:00 2001 From: Martin Bauer <martin.bauer@fau.de> Date: Tue, 1 Oct 2019 08:25:52 +0200 Subject: [PATCH] New benchmark for generated LBM on uniform grid on CPU --- apps/benchmarks/CMakeLists.txt | 1 + .../simulation_setup/communication_compare.py | 196 ++++++++++++++++++ .../UniformGridGenerated/CMakeLists.txt | 12 ++ .../UniformGridGenerated.cpp | 157 ++++++++++++++ .../UniformGridGenerated.py | 133 ++++++++++++ 5 files changed, 499 insertions(+) create mode 100755 apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py create mode 100644 apps/benchmarks/UniformGridGenerated/CMakeLists.txt create mode 100644 apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp create mode 100644 apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 8e83319a..a59d3bc1 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -14,3 +14,4 @@ add_subdirectory( ProbeVsExtraMessage ) add_subdirectory( SchaeferTurek ) add_subdirectory( UniformGrid ) add_subdirectory( UniformGridGPU ) +add_subdirectory( UniformGridGenerated ) diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py b/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py new file mode 100755 index 00000000..4a20b377 --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/communication_compare.py @@ -0,0 +1,196 @@ +#!/usr/bin/env python3 + +import os +import pandas as pd +import waLBerla as wlb +from waLBerla.tools.config import block_decomposition +from waLBerla.tools.sqlitedb import sequenceValuesToScalars +from copy import deepcopy +import sys + +CSV_FILE = "new.csv" + +BASE_CONFIG = { + 'DomainSetup': { + 'cellsPerBlock': (256, 128, 128), + 'periodic': (1, 1, 1), + }, + 'Parameters': { + 'omega': 1.8, + 'timesteps': 400, + 'cudaEnabledMPI': False, + 'warmupSteps': 5, + 'outerIterations': 3, + 'initShearFlow': True, + } +} + + +class Scenario: + def __init__(self, cells_per_block=(256, 128, 128), **kwargs): + self.config_dict = deepcopy(BASE_CONFIG) + self.config_dict['Parameters'].update(kwargs) + self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses()) + self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block + + @wlb.member_callback + def config(self, **kwargs): + from pprint import pformat + wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict)) + return self.config_dict + + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + sequenceValuesToScalars(data) + + df = pd.DataFrame.from_records([data]) + if not os.path.isfile(CSV_FILE): + df.to_csv(CSV_FILE, index=False) + else: + df.to_csv(CSV_FILE, index=False, mode='a', header=False) + + +def overlap_benchmark(): + scenarios = wlb.ScenarioManager() + inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), + (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), + (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] + + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' + # no overlap + scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1))) + + # overlap + for overlap_strategy in ['simpleOverlap', 'complexOverlap']: + for inner_outer_split in inner_outer_splits: + scenario = Scenario(timeStepStrategy=overlap_strategy, + communicationScheme=comm_strategy, + innerOuterSplit=inner_outer_split) + scenarios.add(scenario) + + +def communication_compare(): + scenarios = wlb.ScenarioManager() + for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]: + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='noOverlap', + communicationScheme=comm_strategy) + scenarios.add(sc) + for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]: + if 3 * inner_outer_split[0] < block_size[0]: + continue + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='simpleOverlap', + innerOuterSplit=inner_outer_split, + communicationScheme=comm_strategy) + scenarios.add(sc) + + +def single_gpu_benchmark(): + scenarios = wlb.ScenarioManager() + block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)] + cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), + (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), + (32, 4, 1), (64, 4, 1), (128, 4, 1), + (32, 8, 1), (64, 8, 1), + (32, 16, 1)] + for block_size in block_sizes: + for cuda_block_size in cuda_blocks: + cells = block_size[0] * block_size[1] * block_size[2] + time_steps_for_128_cubed = 1000 + time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed + scenario = Scenario(cells_per_block=block_size, + gpuBlockSize=cuda_block_size, + timeStepStrategy='kernelOnly', + timesteps=int(time_steps)) + scenarios.add(scenario) + + +job_script_header = """ +#!/bin/bash -l +#SBATCH --job-name=scaling +#SBATCH --time=0:30:00 +#SBATCH --nodes={nodes} +#SBATCH -o out_scaling_{nodes}_%j.txt +#SBATCH -e err_scaling_{nodes}_%j.txt +#SBATCH --ntasks-per-core=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=1 +#SBATCH --partition=normal +#SBATCH --constraint=gpu +#SBATCH --account=d105 + +cd {folder} + +source ~/env.sh + +module load daint-gpu +module load craype-accel-nvidia60 +export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer +export CRAY_CUDA_MPS=1 # allow GPU sharing +export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages + +export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK +export CRAY_CUDA_MPS=1 + +export MPICH_RANK_REORDER_METHOD=3 +export PMI_MMAP_SYNC_WAIT_TIME=300 + + +# grid_order -R -H -c 1,1,8 -g 16,16,8 + +ulimit -c 0 +""" + +job_script_exe_part = """ + +export WALBERLA_SCENARIO_IDX=0 +while srun -n {nodes} ./{app} {config} +do + ((WALBERLA_SCENARIO_IDX++)) +done +""" + + +all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27', + 'UniformGridBenchmarkGPU_smagorinsky_d3q27', + 'UniformGridBenchmarkGPU_cumulant' + 'UniformGridBenchmarkGPU_cumulant_d3q27', + ) + + +def generate_jobscripts(exe_names=all_executables): + for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: + folder_name = "scaling_{:04d}".format(node_count) + os.makedirs(folder_name, exist_ok=True) + + # run grid_order + import subprocess + decomposition = block_decomposition(node_count) + decomposition_str = ",".join(str(e) for e in decomposition) + subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str]) + + job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name)) + for exe in exe_names: + job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, config='../communication_compare.py') + + with open(os.path.join(folder_name, 'job.sh'), 'w') as f: + f.write(job_script) + + +if __name__ == '__main__': + print("Called without waLBerla - generating job scripts for PizDaint") + generate_jobscripts() +else: + single_gpu_benchmark() diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt new file mode 100644 index 00000000..def7d93f --- /dev/null +++ b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt @@ -0,0 +1,12 @@ +waLBerla_python_file_generates(UniformGridGenerated.py + UniformGridGenerated_LatticeModel.cpp + UniformGridGenerated_Defines.h) + + +foreach(config trt ) + waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config} + FILES UniformGridGenerated.cpp UniformGridGenerated.py + DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui + CODEGEN_CFG ${config}) +endforeach() + diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp new file mode 100644 index 00000000..f48b0f75 --- /dev/null +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp @@ -0,0 +1,157 @@ +#include "core/Environment.h" +#include "core/logging/Initialization.h" +#include "core/math/Random.h" +#include "python_coupling/CreateConfig.h" +#include "python_coupling/PythonCallback.h" +#include "python_coupling/DictWrapper.h" +#include "blockforest/Initialization.h" +#include "field/vtk/VTKWriter.h" +#include "field/communication/PackInfo.h" +#include "blockforest/communication/UniformBufferedScheme.h" +#include "timeloop/all.h" +#include "core/timing/TimingPool.h" +#include "core/timing/RemainingTimeLogger.h" +#include "domain_decomposition/SharedSweep.h" +#include "lbm/communication/PdfFieldPackInfo.h" +#include "lbm/field/AddToStorage.h" +#include "lbm/vtk/VTKOutput.h" +#include "lbm/gui/Connection.h" +#include "lbm/vtk/Velocity.h" +#include "gui/Gui.h" + +#include "UniformGridGenerated_LatticeModel.h" +#include "UniformGridGenerated_Defines.h" + + +using namespace walberla; + +typedef lbm::UniformGridGenerated_LatticeModel LatticeModel_T; +typedef LatticeModel_T::Stencil Stencil_T; +typedef LatticeModel_T::CommunicationStencil CommunicationStencil_T; +typedef lbm::PdfField< LatticeModel_T > PdfField_T; + + +void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldId, + const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 ) +{ + math::seedRandomGenerator(0); + auto halfZ = blocks->getDomainCellBB().zMax() / 2; + for( auto & block: *blocks) + { + auto pdfField = block.getData<PdfField_T>( pdfFieldId ); + WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField, + Cell globalCell; + blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z)); + real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude); + + if( globalCell[2] >= halfZ ) { + pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0)); + } else { + pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0, randomReal), real_t(1.0)); + } + ); + } +} + + +int main( int argc, char **argv ) +{ + mpi::Environment env( argc, argv ); + + for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) + { + WALBERLA_MPI_WORLD_BARRIER(); + + auto config = *cfg; + logging::configureLogging( config ); + auto blocks = blockforest::createUniformBlockGridFromConfig( config ); + + Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); + // Reading parameters + auto parameters = config->getOneBlock( "Parameters" ); + const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); + const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); + const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); + const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false); + + // Creating fields + LatticeModel_T latticeModel = LatticeModel_T( omega ); + BlockDataID pdfFieldId = lbm::addPdfFieldToStorage( blocks, "pdf field", latticeModel); + + if( initShearFlow ) { + initShearVelocity(blocks, pdfFieldId); + } + + SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps ); + blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > communication( blocks ); + communication.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) ); + + timeLoop.add() << BeforeFunction( communication, "communication" ) + << Sweep( LatticeModel_T::Sweep( pdfFieldId ), "LB stream & collide" ); + + int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); + int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); + for(int i=0; i < warmupSteps; ++i ) + timeLoop.singleStep(); + + auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds + if (remainingTimeLoggerFrequency > 0) { + auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency ); + timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); + } + + // VTK + uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); + if( vtkWriteFrequency > 0 ) + { + auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0 ); + auto velWriter = make_shared< lbm::VelocityVTKWriter<LatticeModel_T> >(pdfFieldId, "vel"); + vtkOutput->addCellDataWriter(velWriter); + timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); + } + + + bool useGui = parameters.getParameter<bool>( "useGui", false ); + if( useGui ) + { + GUI gui( timeLoop, blocks, argc, argv); + lbm::connectToGui<LatticeModel_T>(gui); + gui.run(); + } + else + { + for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) + { + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" ); + simTimer.start(); + timeLoop.run(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" ); + auto time = simTimer.last(); + auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); + auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess ); + WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps )); + WALBERLA_ROOT_SECTION() + { + python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); + if ( pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); + pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); + pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); + pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal ); + pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs ); + // Call Python function to report results + pythonCallbackResults(); + } + } + } + } + } + + return 0; +} diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py new file mode 100644 index 00000000..8f7bf791 --- /dev/null +++ b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py @@ -0,0 +1,133 @@ +import sympy as sp +import pystencils as ps +from lbmpy.creationfunctions import create_lb_collision_rule +from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor +from lbmpy_walberla import generate_lattice_model +from pystencils_walberla import CodeGeneration + +omega = sp.symbols("omega") +omega_fill = sp.symbols("omega_:10") + +options_dict = { + 'srt': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + 'compressible': False, + }, + 'trt': { + 'method': 'trt', + 'stencil': 'D3Q19', + 'relaxation_rate': omega, + }, + 'mrt': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7], + }, + 'mrt_full': { + 'method': 'mrt', + 'stencil': 'D3Q19', + 'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], omega_fill[3], omega_fill[4], omega_fill[5]], + }, + 'mrt3': { + 'method': 'mrt3', + 'stencil': 'D3Q19', + 'relaxation_rates': [omega, 1.1, 1.2], + }, + 'entropic': { + 'method': 'mrt3', + 'stencil': 'D3Q19', + 'compressible': True, + 'relaxation_rates': [omega, omega, sp.Symbol("omega_free")], + 'entropic': True, + }, + 'entropic_kbc_n4': { + 'method': 'trt-kbc-n4', + 'stencil': 'D3Q27', + 'compressible': True, + 'relaxation_rates': [omega, sp.Symbol("omega_free")], + 'entropic': True, + }, + 'smagorinsky': { + 'method': 'srt', + 'stencil': 'D3Q19', + 'smagorinsky': True, + 'relaxation_rate': omega, + }, + 'cumulant': { + 'stencil': 'D3Q19', + 'compressible': True, + 'method': 'mrt', + 'cumulant': True, + 'relaxation_rates': [0, omega, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + }, +} + +info_header = """ +#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q}; +const char * infoStencil = "{stencil}"; +const char * infoConfigName = "{configName}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + + +with CodeGeneration() as ctx: + accessor = StreamPullTwoFieldsAccessor() + assert not accessor.is_inplace, "This app does not work for inplace accessors" + + common_options = { + 'field_name': 'pdfs', + 'temporary_field_name': 'pdfs_tmp', + 'kernel_type': accessor, + 'optimization': {'cse_global': True, + 'cse_pdfs': False} + } + config_name = ctx.config + noopt = False + d3q27 = False + if config_name.endswith("_noopt"): + noopt = True + config_name = config_name[:-len("_noopt")] + if config_name.endswith("_d3q27"): + d3q27 = True + config_name = config_name[:-len("_d3q27")] + + options = options_dict[config_name] + options.update(common_options) + options = options.copy() + + if noopt: + options['optimization']['cse_global'] = False + options['optimization']['cse_pdfs'] = False + if d3q27: + options['stencil'] = 'D3Q27' + + stencil_str = options['stencil'] + q = int(stencil_str[stencil_str.find('Q')+1:]) + pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx') + options['optimization']['symbolic_field'] = pdfs + + vp = [ + ('double', 'omega_0'), + ('double', 'omega_1'), + ('double', 'omega_2'), + ('double', 'omega_3'), + ('double', 'omega_4'), + ('double', 'omega_5'), + ('double', 'omega_6'), + ('int32_t', 'cudaBlockSize0'), + ('int32_t', 'cudaBlockSize1'), + ] + update_rule = create_lb_collision_rule(**options) + generate_lattice_model(ctx, 'UniformGridGenerated_LatticeModel', update_rule) + + infoHeaderParams = { + 'stencil': stencil_str, + 'q': q, + 'configName': ctx.config, + 'cse_global': int(options['optimization']['cse_global']), + 'cse_pdfs': int(options['optimization']['cse_pdfs']), + } + ctx.write_file("UniformGridGenerated_Defines.h", info_header.format(**infoHeaderParams)) -- GitLab