diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index c1ed30eafa8876741fc36490d51e79f998d131a0..391feddd71867bc6e3e3618edb1f878c8426e5ef 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -98,7 +98,7 @@ int main( int argc, char **argv ) const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal"); const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 )); - const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false); + const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true); // Creating fields BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx); @@ -165,7 +165,12 @@ int main( int argc, char **argv ) } Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); - + for(int i=0; i< 3; ++i) + { + if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock"); + } + } int streamHighPriority = 0; int streamLowPriority = 0; diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py index c8a2575067e0947cb135f566c85a8175042168e5..9ae07cd26653e21bc2f02fffc086386d816613a0 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.py @@ -1,7 +1,7 @@ import sympy as sp import numpy as np import pystencils as ps -from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule +from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule from lbmpy.boundaries import NoSlip, UBB from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor from pystencils_walberla import generate_pack_info_from_kernel @@ -39,7 +39,7 @@ options_dict = { 'mrt': { 'method': 'mrt', 'stencil': 'D3Q19', - 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7], + 'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235], }, 'mrt_full': { 'method': 'mrt', @@ -147,7 +147,7 @@ with CodeGeneration() as ctx: # CPU lattice model - required for macroscopic value computation, VTK output etc. 
options_without_opt = options.copy() del options_without_opt['optimization'] - generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method, update_rule_params=options_without_opt) + generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, **options_without_opt)) # gpu LB sweep & boundaries generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule, diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp index fdf1c1bf759ec07236098a22d70111fa9561a2da..c5fb073abd4a45c69308e6be92c338f3f2d0e314 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.cpp @@ -83,6 +83,14 @@ int main( int argc, char **argv ) BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true ); Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1)); + + for(int i=0; i< 3; ++i) + { + if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) { + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock"); + } + } + Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false ); Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1)); diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py index 08d8b875ac232f76dbf1a065a6e7ee4810c8fc66..882de06f9d1bbe5ae2a189d0b38a0dd707d98bc5 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU_AA.py @@ -36,7 +36,7 @@ options_dict = { 'mrt': { 'method': 'mrt', 'stencil': 'D3Q19', - 'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1], + 'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1], }, 'entropic': { 'method': 'mrt3', diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py deleted file mode 100644 index 235bfe948d57ae5947a944b368aa87bdc4d64e9c..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py +++ /dev/null @@ -1,51 +0,0 @@ -# encoding: utf-8 - -import math -import operator -from functools import reduce -import waLBerla as wlb - -# Constants that define the size of blocks that are used in the benchmarks -MIN_CELLS_PER_BLOCK = 16 -MAX_CELLS_PER_BLOCK = 256 -INC_CELLS_PER_BLOCK = 16 -# Amount of cells per block -cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, INC_CELLS_PER_BLOCK) -# Blocks with size in [16, 32, 64, 128, 256] -cells_per_block = [num_cells for num_cells in cells_per_block_interval] -# Number of active MPI processes -num_processes = wlb.mpi.numProcesses() -# Whether to overlap computation with communication -overlap_communication = [False, True] -# Whether MPI supports buffers in GPU memory -cuda_enabled_mpi = [False, True] -# Supported communication schemes -communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy'] - - -def calculate_time_steps(runtime, expected_mlups, domain_size): - cells = reduce(operator.mul, domain_size, 1) - time_steps_per_second = expected_mlups * 1e6 / cells - return int(time_steps_per_second * runtime) - - -def 
side_length_to_fill_memory(memory_fill_percentage, memory_in_gb): - bytes_per_cell = 19 * 2 * 8 - max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage - return int(max_cells**(1/3)) - - -def get_block_decomposition(block_decomposition, num_processes): - bx = by = bz = 1 - blocks_per_axis = int(math.log(num_processes, 2)) - for i in range(blocks_per_axis): - decomposition_axis = block_decomposition[i % len(block_decomposition)] - if decomposition_axis == 'y': - by *= 2 - elif decomposition_axis == 'z': - bz *= 2 - elif decomposition_axis == 'x': - bx *= 2 - - assert (bx * by * bz) == num_processes - return bx, by, bz diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py deleted file mode 100644 index daf4fee66347cbe31015aaf1420318689653c8bb..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py +++ /dev/null @@ -1,109 +0,0 @@ -# encoding: utf-8 - -import os -import pandas as pd -import waLBerla as wlb -import copy -from datetime import datetime - - -CommunicationSchemeType = { - 'GPUPackInfo_Baseline': 0, - 'GPUPackInfo_Streams': 1, - 'UniformGPUScheme_Baseline': 2, - 'UniformGPUScheme_Memcpy': 3, -} - -CommunicationSchemeName = { - 0: 'GPUPackInfo_Baseline', - 1: 'GPUPackInfo_Streams', - 2: 'UniformGPUScheme_Baseline', - 3: 'UniformGPUScheme_Memcpy', -} - -# Base configuration for the benchmark -BASE_CONFIG = { - 'DomainSetup': { - 'cellsPerBlock': (64, 64, 64), - 'blocks': (1, 1, 1), - 'nrOfProcesses': (1, 1, 1), - 'periodic': (0, 0, 1), - 'dx': 1.0 - }, - 'Parameters': { - 'omega': 1.8, - 'timesteps': 1001, - 'remainingTimeLoggerFrequency': 250, - 'vtkWriteFrequency': 0, - 'overlapCommunication': False, - 'cudaEnabledMPI': False, - 'initialVelocity': (0, 0, 0), - 'performanceReportFrequency': 250, - 'communicationScheme': CommunicationSchemeType['UniformGPUScheme_Baseline'], - }, - 'Boundaries': { - 'Border': [ - {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, - ] - } -} - - -class BenchmarkScenario: - def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False): - self.testcase = testcase - self.scenario_config = copy.deepcopy(BASE_CONFIG) - self.scenario_config['Parameters']['timesteps'] = time_steps - self.fully_periodic = fully_periodic - if fully_periodic: - del self.scenario_config['Boundaries']['Border'] - self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1) - self.decomposition_axes = decomposition_axes - - now = datetime.now().replace(second=0, microsecond=0) - self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv' - - def get_data(self): - block_setup = self.scenario_config.get('DomainSetup') - params = self.scenario_config.get('Parameters') - - return { - 'processesX': block_setup.get('nrOfProcesses')[0], - 'processesY': block_setup.get('nrOfProcesses')[1], - 'processesZ': block_setup.get('nrOfProcesses')[2], - 'blocksX': block_setup.get('blocks')[0], - 'blocksY': block_setup.get('blocks')[1], - 'blocksZ': block_setup.get('blocks')[2], - 'fully_periodic': self.fully_periodic, - 'cellsPerBlockX': block_setup.get('cellsPerBlock')[0], - 'cellsPerBlockY': block_setup.get('cellsPerBlock')[1], - 'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2], - 'cudaEnabledMPI': 
params.get('cudaEnabledMPI'), - 'overlapCommunication': params.get('overlapCommunication'), - 'time_steps': params['timesteps'], - 'domainDecomposition': self.decomposition_axes, - 'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')], - } - - @wlb.member_callback - def config(self, **kwargs): - from pprint import pformat - wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data())) - return self.scenario_config - - @wlb.member_callback - def results_callback(self, **kwargs): - data = self.get_data() - data.update(kwargs) - self.save_data([data]) - - def save_data(self, data): - df = pd.DataFrame(data) - if not os.path.isfile(self.output_filename): - df.to_csv(self.output_filename, index=False) - else: - df.to_csv(self.output_filename, index=False, mode='a', header=False) - diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py new file mode 100755 index 0000000000000000000000000000000000000000..ca66b364cd331050e12a3540036fbe0ab9bd083b --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +""" +This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication. +Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1 then run e.g. + ./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py + +Look at the end of the file to select the benchmark to run +""" + +import os +import waLBerla as wlb +from waLBerla.tools.config import block_decomposition, toPrm +from waLBerla.tools.sqlitedb import * +from copy import deepcopy +import sys +import sqlite3 + +# Number of time steps run for a workload of 128^3 per GPU +# if double as many cells are on the GPU, half as many time steps are run etc. +# increase this to get more reliable measurements +TIME_STEPS_FOR_128_BLOCK = 200 +DB_FILE = "gpu_benchmark.sqlite3" + +BASE_CONFIG = { + 'DomainSetup': { + 'cellsPerBlock': (256, 128, 128), + 'periodic': (1, 1, 1), + }, + 'Parameters': { + 'omega': 1.8, + 'cudaEnabledMPI': False, + 'warmupSteps': 5, + 'outerIterations': 3, + } +} + + +def num_time_steps(block_size): + cells = block_size[0] * block_size[1] * block_size[2] + time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK + return int(time_steps) + + +class Scenario: + def __init__(self, cells_per_block=(256, 128, 128), **kwargs): + self.config_dict = deepcopy(BASE_CONFIG) + self.config_dict['Parameters'].update(kwargs) + self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses()) + self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block + + @wlb.member_callback + def config(self, **kwargs): + from pprint import pformat + wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict)) + # Write out the configuration as text-based prm: + #print(toPrm(self.config_dict)) + return self.config_dict + + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + sequenceValuesToScalars(data) + + result = data + sequenceValuesToScalars(result) + num_tries = 4 + for num_try in range(num_tries): # check multiple times e.g. 
may fail when multiple benchmark processes are running + try: + checkAndUpdateSchema(result, "runs", DB_FILE) + storeSingle(result, "runs", DB_FILE) + break + except sqlite3.OperationalError as e: + wlb.log_warning("Sqlite DB writing failed: try {}/{} {}".format(num_try+1, num_tries, str(e))) + + +# -------------------------------------- Functions trying different parameter sets ------------------------------------------------------------------- + + +def overlap_benchmark(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running different communication overlap strategies") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), + (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), + (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] + + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' + # no overlap + scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1))) + + # overlap + for overlap_strategy in ['simpleOverlap', 'complexOverlap']: + for inner_outer_split in inner_outer_splits: + scenario = Scenario(timeStepStrategy=overlap_strategy, + communicationScheme=comm_strategy, + innerOuterSplit=inner_outer_split, + timesteps=num_time_steps((256, 128, 128))) + scenarios.add(scenario) + + +def communication_compare(): + """Tests different communication strategies""" + wlb.log_info_on_root("Running benchmarks to compare communication strategies") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]: + for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: + + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='noOverlap', + communicationScheme=comm_strategy, + timesteps=num_time_steps(block_size)) + scenarios.add(sc) + for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]: + if 3 * inner_outer_split[0] > block_size[0]: # ensure that the inner part of the domain is still large enough + continue + sc = Scenario(cells_per_block=block_size, + gpuBlockSize=(128, 1, 1), + timeStepStrategy='simpleOverlap', + innerOuterSplit=inner_outer_split, + communicationScheme=comm_strategy, + timesteps=num_time_steps(block_size)) + scenarios.add(sc) + + +def single_gpu_benchmark(): + """Benchmarks only the LBM compute kernel""" + wlb.log_info_on_root("Running single GPU benchmarks") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)] + cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1), + (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), + (32, 4, 1), (64, 4, 1), (128, 4, 1), + (32, 8, 1), (64, 8, 1), + (32, 16, 1)] + for block_size in block_sizes: + for cuda_block_size in cuda_blocks: + scenario = Scenario(cells_per_block=block_size, + gpuBlockSize=cuda_block_size, + timeStepStrategy='kernelOnly', + timesteps=num_time_steps(block_size)) + scenarios.add(scenario) + + +# -------------------------------------- Optional job script generation for PizDaint ------------------------------------------------------------------- + + +job_script_header = """ +#!/bin/bash -l +#SBATCH --job-name=scaling +#SBATCH --time=0:30:00 +#SBATCH --nodes={nodes} +#SBATCH -o out_scaling_{nodes}_%j.txt 
+#SBATCH -e err_scaling_{nodes}_%j.txt
+#SBATCH --ntasks-per-core=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=1
+#SBATCH --partition=normal
+#SBATCH --constraint=gpu
+#SBATCH --account=d105
+
+cd {folder}
+
+source ~/env.sh
+
+module load daint-gpu
+module load craype-accel-nvidia60
+export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer
+export CRAY_CUDA_MPS=1 # allow GPU sharing
+export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages
+
+export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
+export CRAY_CUDA_MPS=1
+
+export MPICH_RANK_REORDER_METHOD=3
+export PMI_MMAP_SYNC_WAIT_TIME=300
+
+
+# grid_order -R -H -c 1,1,8 -g 16,16,8
+
+ulimit -c 0
+"""
+
+job_script_exe_part = """
+
+export WALBERLA_SCENARIO_IDX=0
+while srun -n {nodes} ./{app} {config}
+do
+    ((WALBERLA_SCENARIO_IDX++))
+done
+"""
+
+
+all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
+                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
+                   'UniformGridBenchmarkGPU_cumulant',
+                   'UniformGridBenchmarkGPU_cumulant_d3q27')
+
+
+def generate_jobscripts(exe_names=all_executables):
+    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
+        folder_name = "scaling_{:04d}".format(node_count)
+        os.makedirs(folder_name, exist_ok=True)
+
+        # run grid_order
+        import subprocess
+        decomposition = block_decomposition(node_count)
+        decomposition_str = ",".join(str(e) for e in decomposition)
+        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
+
+        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
+        for exe in exe_names:
+            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, config='../communication_compare.py')
+
+        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
+            f.write(job_script)
+
+
+if __name__ == '__main__':
+    print("Called without waLBerla - generating job scripts for PizDaint")
+    generate_jobscripts()
+else:
+    wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
+    # Select the benchmark you want to run
+    single_gpu_benchmark()  # benchmarks different CUDA block sizes and domain sizes and measures single GPU performance of the compute kernel (no communication)
+    #communication_compare()  # benchmarks different communication routines, with and without overlap
+    #overlap_benchmark()  # benchmarks different communication overlap options
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
deleted file mode 100644
index 6498878fa229f8f2a7219fb2dc85abd32b211d19..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/inter_node.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-    cuda_enabled_mpi, cells_per_block, num_processes
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xy', 'yz', 'xz']
-
-scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
-
-testcase_name = "inter-node"
-
-for scenario_params in
scenario_generator: - # Extract parameters from tuple - comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params - if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True: - # Skip CUDA enabled MPI tests for GPUPackInfo tests - continue - elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True: - # Skip communication overlap tests for GPUPackInfo baseline - continue - # Convert the axes decompositions to string - decomposition_axes_str = ''.join(decomposition_axes) - # Compute block decomposition based on the specified axes and the number of processes - blocks = get_block_decomposition(decomposition_axes, num_processes) - # Create a benchmark scenario - scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str) - # Domain Setup parameters - domain_setup = scenario.scenario_config['DomainSetup'] - domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,) - domain_setup['nrOfProcesses'] = blocks - domain_setup['blocks'] = blocks - # Additional parameters for benchmarking - params = scenario.scenario_config['Parameters'] - params['cudaEnabledMPI'] = is_cuda_enabled_mpi - params['overlapCommunication'] = is_communication_overlapped - params['communicationScheme'] = CommunicationSchemeType[comm_scheme] - # Add scenario for execution - scenarios.add(scenario) diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py deleted file mode 100644 index 46b6c06002d0ac663dfeef6ae2dbb32bb8ba300d..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/intra_node.py +++ /dev/null @@ -1,47 +0,0 @@ -# encoding: utf-8 - -import itertools -import waLBerla as wlb -from base import get_block_decomposition, communication_schemes, overlap_communication, \ - cuda_enabled_mpi, cells_per_block, num_processes -from benchmark import BenchmarkScenario, CommunicationSchemeType - - -# Stores the scenarios for the current simulation -scenarios = wlb.ScenarioManager() - -# Generates all block decompositions of xyz, 2 directions at a time -#block_decompositions = itertools.combinations_with_replacement('xyz', r=2) -block_decompositions = ['xy', 'yz', 'zx'] - -scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block) - -testcase_name = "intra-node" - -for scenario_params in scenario_generator: - # Extract parameters from tuple - comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params - if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True: - # Skip CUDA enabled MPI tests for GPUPackInfo tests - continue - elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True: - # Skip communication overlap tests for GPUPackInfo baseline - continue - # Convert the axes decompositions to string - decomposition_axes_str = ''.join(decomposition_axes) - # Compute block decomposition based on the specified axes and the number of processes - blocks = get_block_decomposition(decomposition_axes, num_processes) - # Create a benchmark scenario - scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str) - # Domain Setup parameters - domain_setup = scenario.scenario_config['DomainSetup'] - domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,) - 
domain_setup['nrOfProcesses'] = blocks - domain_setup['blocks'] = blocks - # Additional parameters for benchmarking - params = scenario.scenario_config['Parameters'] - params['cudaEnabledMPI'] = is_cuda_enabled_mpi - params['overlapCommunication'] = is_communication_overlapped - params['communicationScheme'] = CommunicationSchemeType[comm_scheme] - # Add scenario for execution - scenarios.add(scenario) diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py deleted file mode 100755 index 6d64d4d89f7ed0b82a194997b08c60cbeabd236d..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/overlap_benchmark.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 - -import os -import pandas as pd -import waLBerla as wlb -from waLBerla.tools.config import block_decomposition -from waLBerla.tools.sqlitedb import sequenceValuesToScalars -from os import getcwd -from waLBerla.tools.jobscripts import createJobscript -from datetime import timedelta -from copy import deepcopy -import sys - -CSV_FILE = "overlap_benchmark.csv" - -BASE_CONFIG = { - 'DomainSetup': { - 'cellsPerBlock': (256, 128, 128), - 'periodic': (1, 1, 1), - }, - 'Parameters': { - 'omega': 1.8, - 'timesteps': 400, - 'cudaEnabledMPI': False, - 'warmupSteps': 5, - 'outerIterations': 3, - 'initShearFlow': True, - } -} - - -class Scenario: - def __init__(self, cells_per_block=(256, 128, 128), **kwargs): - self.config_dict = deepcopy(BASE_CONFIG) - self.config_dict['Parameters'].update(kwargs) - self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses()) - self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block - - @wlb.member_callback - def config(self, **kwargs): - from pprint import pformat - wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict)) - return self.config_dict - - @wlb.member_callback - def results_callback(self, **kwargs): - data = {} - data.update(self.config_dict['Parameters']) - data.update(self.config_dict['DomainSetup']) - data.update(kwargs) - data['executable'] = sys.argv[0] - data['compile_flags'] = wlb.build_info.compiler_flags - data['walberla_version'] = wlb.build_info.version - data['build_machine'] = wlb.build_info.build_machine - sequenceValuesToScalars(data) - - df = pd.DataFrame.from_records([data]) - if not os.path.isfile(CSV_FILE): - df.to_csv(CSV_FILE, index=False) - else: - df.to_csv(CSV_FILE, index=False, mode='a', header=False) - - -def overlap_benchmark(): - scenarios = wlb.ScenarioManager() - inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1), - (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1), - (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)] - - for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']: # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams' - # no overlap - scenarios.add(Scenario(timeStepStrategy='noOverlap', communicationScheme=comm_strategy, innerOuterSplit=(1, 1, 1))) - - # overlap - for overlap_strategy in ['simpleOverlap', 'complexOverlap']: - for inner_outer_split in inner_outer_splits: - scenario = Scenario(timeStepStrategy=overlap_strategy, - communicationScheme=comm_strategy, - innerOuterSplit=inner_outer_split) - scenarios.add(scenario) - - -def single_gpu_benchmark(): - scenarios = wlb.ScenarioManager() - block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)] - cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 
1, 1), - (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1), - (32, 4, 1), (64, 4, 1), (128, 4, 1), - (32, 8, 1), (64, 8, 1), - (32, 16, 1)] - for block_size in block_sizes: - for cuda_block_size in cuda_blocks: - cells = block_size[0] * block_size[1] * block_size[2] - time_steps_for_128_cubed = 1000 - time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed - scenario = Scenario(cells_per_block=block_size, - gpuBlockSize=cuda_block_size, - timeStepStrategy='kernelOnly', - timesteps=int(time_steps)) - scenarios.add(scenario) - - -all_executables = ('UniformGridBenchmarkGPU_AA_entropic', - 'UniformGridBenchmarkGPU_AA_mrt', - 'UniformGridBenchmarkGPU_AA_smagorinsky', - 'UniformGridBenchmarkGPU_AA_srt', - 'UniformGridBenchmarkGPU_AA_trt', - 'UniformGridBenchmarkGPU_entropic', - 'UniformGridBenchmarkGPU_mrt', - 'UniformGridBenchmarkGPU_smagorinsky', - 'UniformGridBenchmarkGPU_srt', - 'UniformGridBenchmarkGPU_trt') - - -def generate_jobscripts(machine='pizdaint_hybrid', - exe_names=all_executables): - for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]: - with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f: - - js = createJobscript(nodes=node_count, - output_file='overlap_bench_{:04d}_%j.txt'.format(node_count), - error_file='overlap_bench_{:04d}_%j.txt'.format(node_count), - initial_dir=getcwd(), - commands=list(("./" + exe, 'overlap_benchmark.py') for exe in exe_names), - wall_time=timedelta(minutes=25), - machine=machine, - account='d105', - ) - f.write(js) - - -if __name__ == '__main__': - print("Called without waLBerla - generating job scripts for PizDaint") - generate_jobscripts() -else: - single_gpu_benchmark() diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py deleted file mode 100755 index bfd59cd4c343015b4dd2ffe931f2167e7129665a..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 -from os import getcwd -from waLBerla.tools.jobscripts import createJobscript -from datetime import timedelta - - -for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]: - with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f: - js = createJobscript(nodes=node_count, - output_file='out_lbm_bench_{:04d}_%j.txt'.format(node_count), - error_file='err_lbm_bench_{:04d}_%j.txt'.format(node_count), - initial_dir=getcwd(), - exe_name='UniformGridBenchmarkGPU', - parameter_files=['weak_scaling.py'], - wall_time=timedelta(minutes=25), - machine='pizdaint_hybrid', - account='d105', - ) - f.write(js) diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm new file mode 100644 index 0000000000000000000000000000000000000000..ef257af8bc013da08ad5d04765e786c44a756a5e --- /dev/null +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.prm @@ -0,0 +1,37 @@ +DomainSetup +{ + blocks < 1,1,1 >; // use as many blocks as MPI processes + cellsPerBlock < 128,64,64 >; // domain size per MPI process, leave constant for weak scaling + periodic < 1,1,1 >; +} + + +Parameters +{ + cudaEnabledMPI False; // set to true, if MPI was compiled with CUDA + outerIterations 3; // number of measurements to make + timeStepStrategy simpleOverlap; // one of simpleOverlap, noOverlap, the non-AA version also supports complexOverlap + // fastest is simpleOverlap + 
innerOuterSplit <8, 1, 1>; // only important when overlapping communication
+                      // the domain is split into a communication-dependent outer part and an inner part
+                      // since processing a single thin outer layer is slow, this parameter can make the outer part thicker than strictly necessary
+                      // it specifies the thickness of the outer layer in each direction
+                      // make sure the block is large enough: the outer part is 2*innerOuterSplit cells thick, so an inner part must remain
+ timesteps 2000; // time steps per measurement
+ warmupSteps 5; // number of time steps before starting measurement
+
+ vtkWriteFrequency 0; // how often to write VTK output
+
+ gpuBlockSize < 128,1,1 >; // size of CUDA blocks - usually large x extents are fast
+ omega 1.8;
+
+ // valid in the non-AA version - determines how the ghost layer exchange is done
+ // the AA version always uses the fastest scheme, "UniformGPUScheme_Baseline"
+ //communicationScheme UniformGPUScheme_Baseline
+ // UniformGPUScheme_Baseline: packing/unpacking in generated kernels, every direction is handled by a separate CUDA stream, can make use of CUDA-aware MPI, most probably the fastest version
+ // UniformGPUScheme_Memcpy: same as above, but packing is done with cudaMemcpy(3D)
+ // MPIDatatypes: use MPI datatypes for packing, needs CUDA-aware MPI
+ // MPIDatatypesFull: same as above but sends all PDFs
+ // GPUPackInfo_Baseline: old implementation based on the communication mechanism for CPUs
+ // GPUPackInfo_Streams: same as above but with CUDA streams
+}
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py b/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
deleted file mode 100644
index 78da581df4c61cb84a2e94fae4a9b4ed5505d393..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/single_node.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import cells_per_block, num_processes
-from benchmark import BenchmarkScenario
-
-
-scenarios = wlb.ScenarioManager()
-
-testcase_name = "single-node"
-
-assert num_processes == 1
-
-for num_cells_per_block in cells_per_block:
-    # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name)
-    scenario.scenario_config['DomainSetup']['cellsPerBlock'] = 3 * (num_cells_per_block,)
-    # Add scenario for execution
-    scenarios.add(scenario)
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
deleted file mode 100644
index 7f038c837b81d836fba828685e15d634bad3f5dc..0000000000000000000000000000000000000000
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/strong_scaling.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# encoding: utf-8
-
-import itertools
-import waLBerla as wlb
-from base import get_block_decomposition, communication_schemes, overlap_communication, \
-    cuda_enabled_mpi, num_processes
-from benchmark import BenchmarkScenario, CommunicationSchemeType
-
-
-# Stores the scenarios for the current simulation
-scenarios = wlb.ScenarioManager()
-
-# Generates all block decompositions of xyz, 2 directions at a time
-#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
-block_decompositions = ['xyz', 'yzx', 'yxz', 'zyx']
-
-cells_per_block = [256,]
-
-if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
-                                           block_decompositions, cells_per_block)
-else:
-    scenario_generator = itertools.product(communication_schemes,
overlap_communication, - cuda_enabled_mpi, block_decompositions, cells_per_block) - -testcase_name = "strong-scaling" - -for scenario_params in scenario_generator: - # Extract parameters from tuple - comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params - if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True: - # Skip CUDA enabled MPI tests for GPUPackInfo tests - continue - elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True: - # Skip communication overlap tests for GPUPackInfo baseline - continue - - # Convert the axes decompositions to string - decomposition_axes_str = ''.join(decomposition_axes) - # Compute block decomposition based on the specified axes and the number of processes - blocks = get_block_decomposition(decomposition_axes, num_processes) - # Create a benchmark scenario - scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str) - # Domain Setup parameters - domain_setup = scenario.scenario_config['DomainSetup'] - domain_setup['cellsPerBlock'] = tuple(num_cells_per_block // block for block in blocks) - domain_setup['nrOfProcesses'] = blocks - domain_setup['blocks'] = blocks - # Additional parameters for benchmarking - params = scenario.scenario_config['Parameters'] - params['cudaEnabledMPI'] = is_cuda_enabled_mpi - params['overlapCommunication'] = is_communication_overlapped - params['communicationScheme'] = CommunicationSchemeType[comm_scheme] - # Add scenario for execution - scenarios.add(scenario) - diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py deleted file mode 100644 index 66f9b2590625e7401c6cf830d35286c7bda23790..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py +++ /dev/null @@ -1,65 +0,0 @@ -# encoding: utf-8 - -import itertools -import waLBerla as wlb -from base import get_block_decomposition, communication_schemes, overlap_communication, \ - cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory -from benchmark import BenchmarkScenario, CommunicationSchemeType - - -# Stores the scenarios for the current simulation -scenarios = wlb.ScenarioManager() - -# Generates all block decompositions of xyz, 2 directions at a time -#block_decompositions = itertools.combinations_with_replacement('xyz', r=2) -block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz'] - -# compute number of cells depending on GPU memory i.e. 
by specifying the percentage of GPU memory to fill -gpu_memory_gb = 16 -cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)] - -expected_mlups = 200 # to compute how many time steps have to be done -time_per_scenarios = 5 # benchmark time in seconds - -fully_periodic = [False, True] - -if num_processes == 1: - scenario_generator = itertools.product(communication_schemes, [False, ], [False, ], - block_decompositions, cells_per_block, fully_periodic) -else: - scenario_generator = itertools.product(communication_schemes, [True], - cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic) - -testcase_name = "weak-scaling" - -for scenario_params in scenario_generator: - # Extract parameters from tuple - comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, fully_periodic = scenario_params - if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True: - # Skip CUDA enabled MPI tests for GPUPackInfo tests - continue - elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True: - # Skip communication overlap tests for GPUPackInfo without streams - continue - - # Convert the axes decompositions to string - decomposition_axes_str = ''.join(decomposition_axes) - # Compute block decomposition based on the specified axes and the number of processes - blocks = get_block_decomposition(decomposition_axes, num_processes) - # Estimate number of time steps - time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,))) - # Create a benchmark scenario - scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str, - time_steps=time_steps, fully_periodic=fully_periodic) - # Domain Setup parameters - domain_setup = scenario.scenario_config['DomainSetup'] - domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,) - domain_setup['nrOfProcesses'] = blocks - domain_setup['blocks'] = blocks - # Additional parameters for benchmarking - params = scenario.scenario_config['Parameters'] - params['cudaEnabledMPI'] = is_cuda_enabled_mpi - params['overlapCommunication'] = is_communication_overlapped - params['communicationScheme'] = CommunicationSchemeType[comm_scheme] - # Add scenario for execution - scenarios.add(scenario)
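
Usage note: the patch adds two rules that interact when writing scenario scripts - the C++ apps now abort if cellsPerBlock[i] <= innerOuterSplit[i] * 2 in any direction, and benchmark_configs.py scales the time-step count so every block size does comparable work (TIME_STEPS_FOR_128_BLOCK = 200 for a 128^3 block). The standalone sketch below restates both rules so they can be checked before calling scenarios.add(...), similar to the 3 * inner_outer_split[0] > block_size[0] guard already used in communication_compare(). The helper names are illustrative only and not part of the patch.

# Illustrative sketch, not part of the patch: mirrors the validity and scaling rules above.

def inner_outer_split_is_valid(cells_per_block, inner_outer_split):
    # The outer part is 2 * innerOuterSplit cells thick per direction, so an inner part
    # only remains if every direction satisfies cellsPerBlock[i] > 2 * innerOuterSplit[i];
    # otherwise the executable calls WALBERLA_ABORT_NO_DEBUG_INFO.
    return all(c > 2 * s for c, s in zip(cells_per_block, inner_outer_split))


def num_time_steps(block_size, time_steps_for_128_block=200):
    # Same rule as in benchmark_configs.py: twice the cells means half the time steps,
    # so each scenario runs for a comparable wall-clock time.
    cells = block_size[0] * block_size[1] * block_size[2]
    return int((128 ** 3 / cells) * time_steps_for_128_block)


if __name__ == '__main__':
    # (256, 128, 128) holds twice as many cells as 128^3, so 100 instead of 200 steps.
    assert num_time_steps((256, 128, 128)) == 100
    # A (32, 32, 32) block cannot take a (16, 16, 16) split: no inner part would remain.
    assert not inner_outer_split_is_valid((32, 32, 32), (16, 16, 16))
    assert inner_outer_split_is_valid((128, 64, 64), (8, 1, 1))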