diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
index d749568caed53a30379bafeb8c65fd742b45bbc3..f6dc064cfa6a2dc138a9bf4fee5523c7e8a969a0 100644
--- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
+++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp
@@ -176,18 +176,14 @@ int main( int argc, char **argv )
       auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
       timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );

-      /*
-      lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
-      timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" );
-
-      timeLoop.run();
-      */

       auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
       lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
       timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );

+      WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
       timeLoop.run();
+      WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");

       std::map< std::string, int > integerProperties;
       std::map< std::string, double > realProperties;
@@ -204,9 +200,6 @@ int main( int argc, char **argv )
          pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
          pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );

-         pythonCallbackResults.data().exposeValue( "mflups_total", realProperties["MFLUPS"] );
-         pythonCallbackResults.data().exposeValue( "mflups_process", realProperties["MFLUPS_process"] );
-
          // Call Python function to report results
          pythonCallbackResults();
       }
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
index c72988a8bc772426e794b2e5660fa34fd8d17126..7b98938f2b226dd834dfbc8a46875e940d003bd3 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/base.py
@@ -1,9 +1,10 @@
 # encoding: utf-8
 import math
+import operator
+from functools import reduce

 import waLBerla as wlb

-
 # Constants that define the size of blocks that are used in the benchmarks
 MIN_CELLS_PER_BLOCK = 16
 MAX_CELLS_PER_BLOCK = 256
@@ -13,7 +14,7 @@ cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, I
 # Blocks with size in [16, 32, 64, 128, 256]
 cells_per_block = [num_cells for num_cells in cells_per_block_interval]
 # Number of active MPI processes
-num_processes = wlb.mpi.numProcesses()
+num_processes = wlb.mpi.numProcesses()
 # Whether to overlap computation with communication
 overlap_communication = [False, True]
 # Whether MPI supports buffers in GPU memory
@@ -22,6 +23,18 @@ cuda_enabled_mpi = [False, True]
 communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']


+def calculate_time_steps(runtime, expected_mlups, domain_size):
+    cells = reduce(operator.mul, domain_size, 1)
+    time_steps_per_second = expected_mlups * 1e6 / cells
+    return int(time_steps_per_second * runtime)
+
+
+def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
+    bytes_per_cell = 19 * 2 * 8  # D3Q19: 19 PDFs, two fields, 8 bytes per value
+    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
+    return int(max_cells**(1 / 3))
+
+
 def get_block_decomposition(block_decomposition, num_processes):
     bx = by = bz = 1
     blocks_per_axis = int(math.log(num_processes, 2))
@@ -35,5 +48,4 @@ def get_block_decomposition(block_decomposition, num_processes):
             bx *= 2

     assert (bx * by * bz) == num_processes
-
-    return (bx, by, bz)
+    return bx, by, bz
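A quick sanity check of the two helpers added to `base.py` above, not part of the patch. The 304 bytes/cell figure (19 PDFs x 2 fields x 8 bytes) is taken from the code itself; the 16 GB GPU and 200 MLUPS inputs mirror the values used later in `weak_scaling.py`, and the printed numbers are only an illustration:

```python
import operator
from functools import reduce


def calculate_time_steps(runtime, expected_mlups, domain_size):
    cells = reduce(operator.mul, domain_size, 1)
    time_steps_per_second = expected_mlups * 1e6 / cells
    return int(time_steps_per_second * runtime)


def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
    bytes_per_cell = 19 * 2 * 8  # D3Q19, two PDF fields, double precision
    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
    return int(max_cells**(1 / 3))


side = side_length_to_fill_memory(0.8, 16)        # fill 80 % of a 16 GB GPU
print(side)                                       # -> 347, i.e. 347^3 cells ~ 12.7 GB
print(calculate_time_steps(3, 200, 3 * (side,)))  # -> 14 time steps in ~3 s at 200 MLUPS
```

Note the `int(...)` on the return value of `calculate_time_steps`: the result is written into the `timesteps` config parameter, which the C++ side reads as an unsigned integer, so a float must not leak through here.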
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
index 76c8a8967711f7e902acda23b4ad9bf56f587f31..daf4fee66347cbe31015aaf1420318689653c8bb 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark.py
@@ -23,7 +23,7 @@ CommunicationSchemeName = {

 # Base configuration for the benchmark
 BASE_CONFIG = {
-    'DomainSetup' : {
+    'DomainSetup': {
         'cellsPerBlock': (64, 64, 64),
         'blocks': (1, 1, 1),
         'nrOfProcesses': (1, 1, 1),
@@ -53,44 +53,52 @@ BASE_CONFIG = {


 class BenchmarkScenario:
-    def __init__(self, testcase, decomposition_axes=None):
+    def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
         self.testcase = testcase
         self.scenario_config = copy.deepcopy(BASE_CONFIG)
+        self.scenario_config['Parameters']['timesteps'] = time_steps
+        self.fully_periodic = fully_periodic
+        if fully_periodic:
+            del self.scenario_config['Boundaries']['Border']
+            self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
         self.decomposition_axes = decomposition_axes

         now = datetime.now().replace(second=0, microsecond=0)
         self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'

-    @wlb.member_callback
-    def config(self, **kwargs):
-        return self.scenario_config
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
+    def get_data(self):
         block_setup = self.scenario_config.get('DomainSetup')
         params = self.scenario_config.get('Parameters')

-        data = [{
+        return {
             'processesX': block_setup.get('nrOfProcesses')[0],
             'processesY': block_setup.get('nrOfProcesses')[1],
             'processesZ': block_setup.get('nrOfProcesses')[2],
             'blocksX': block_setup.get('blocks')[0],
             'blocksY': block_setup.get('blocks')[1],
             'blocksZ': block_setup.get('blocks')[2],
+            'fully_periodic': self.fully_periodic,
             'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
             'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
             'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
             'cudaEnabledMPI': params.get('cudaEnabledMPI'),
             'overlapCommunication': params.get('overlapCommunication'),
+            'time_steps': params['timesteps'],
             'domainDecomposition': self.decomposition_axes,
             'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
-            'mlupsTotal': kwargs.get('mlups_total'),
-            'mlupsProcess': kwargs.get('mlups_process'),
-            'mflupsTotal': kwargs.get('mflups_total'),
-            'mflupsProcess': kwargs.get('mflups_process'),
-        }]
+        }

-        self.save_data(data)
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
+        return self.scenario_config
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = self.get_data()
+        data.update(kwargs)
+        self.save_data([data])

     def save_data(self, data):
         df = pd.DataFrame(data)
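With the refactoring above, `results_callback` builds each CSV row by merging the static scenario description from `get_data()` with whatever result values the C++ side exposes (here `mlups_total` and `mlups_process` from `UniformGridGPU.cpp`). A minimal sketch of that merge, with made-up numbers and assuming a Python environment where the waLBerla module is importable (it is needed for the `wlb.member_callback` decorator in `benchmark.py`):

```python
from benchmark import BenchmarkScenario

scenario = BenchmarkScenario(testcase='demo', time_steps=100,
                             decomposition_axes='xyz', fully_periodic=True)
row = scenario.get_data()             # static setup columns for the CSV
row.update({'mlups_total': 950.0,     # values normally passed in as kwargs
            'mlups_process': 950.0})  # (made-up numbers for illustration)
scenario.save_data([row])             # one CSV row, same as results_callback
```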
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c14d5b9b562bb53143b22bb660d82fecf66e500
--- /dev/null
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/pizdaint_jobfiles.py
@@ -0,0 +1,17 @@
+from os import getcwd
+from waLBerla.tools.jobscripts import createJobscript
+from datetime import timedelta
+
+
+for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+    with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f:
+        js = createJobscript(nodes=node_count,
+                             output_file='out_lbm_bench_%j.txt',
+                             error_file='err_lbm_bench_%j.txt',
+                             initial_dir=getcwd(),
+                             exe_name='UniformGridBenchmarkGPU',
+                             parameter_files=['weak_scaling.py'],
+                             wall_time=timedelta(minutes=25),
+                             machine='pizdaint_hybrid'
+                             )
+        f.write(js)
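The generator above writes one zero-padded job script per node count; how these are submitted (e.g. via `sbatch` on the Piz Daint login node, since `pizdaint_hybrid` is a SLURM machine) is an assumption and not part of this patch:

```python
# Filenames produced by pizdaint_jobfiles.py:
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
    print("job_weak_scaling_{:04d}.sh".format(node_count))
# -> job_weak_scaling_0001.sh, job_weak_scaling_0002.sh, ..., job_weak_scaling_2400.sh
```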
diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
index c29d935dca8306856ef4646c0565f636bb0fe7fb..8076aeef7405f6bb5101d7a61be488c0babd3932 100644
--- a/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
+++ b/apps/benchmarks/UniformGridGPU/simulation_setup/weak_scaling.py
@@ -3,7 +3,7 @@ import itertools
 import waLBerla as wlb

 from base import get_block_decomposition, communication_schemes, overlap_communication, \
-    cuda_enabled_mpi, num_processes
+    cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory
 from benchmark import BenchmarkScenario, CommunicationSchemeType


@@ -14,20 +14,27 @@ scenarios = wlb.ScenarioManager()

 #block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
 block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']

-cells_per_block = [64, 128, 240, 256]
+# compute the number of cells from the available GPU memory, i.e. by specifying the percentage of memory to fill
+gpu_memory_gb = 16
+cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]
+
+expected_mlups = 200  # used to estimate how many time steps have to be done
+time_per_scenarios = 3  # benchmark time in seconds
+
+fully_periodic = [False, True]

 if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
-                                           block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [False, ], [False, ],
+                                           block_decompositions, cells_per_block, fully_periodic)
 else:
-    scenario_generator = itertools.product(communication_schemes, overlap_communication,
-                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [True],
+                                           cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic)

 testcase_name = "weak-scaling"
 for scenario_params in scenario_generator:
     # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, is_fully_periodic = scenario_params
     if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
         # Skip CUDA enabled MPI tests for GPUPackInfo tests
         continue
@@ -39,8 +46,11 @@ for scenario_params in scenario_generator:
     decomposition_axes_str = ''.join(decomposition_axes)
     # Compute block decomposition based on the specified axes and the number of processes
     blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Estimate the number of time steps so that each scenario runs for roughly time_per_scenarios seconds
+    time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,)))
     # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str,
+                                 time_steps=time_steps, fully_periodic=is_fully_periodic)
     # Domain Setup parameters
     domain_setup = scenario.scenario_config['DomainSetup']
     domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
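For the multi-process branch, the generator above enumerates 3 communication schemes x 1 overlap setting (pinned to `[True]`) x 2 MPI modes x 4 decompositions x 3 block sizes x 2 periodicity settings = 144 tuples, of which the CUDA-aware-MPI combinations of the two non-baseline schemes are skipped. A quick standalone check of that count (the cells-per-block values are the ones a 16 GB GPU would yield, for illustration only):

```python
import itertools

communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
gen = itertools.product(communication_schemes, [True], [False, True],
                        ['xyz', 'yzx', 'zyx', 'yxz'], [347, 297, 138], [False, True])
# Apply the same skip rule as the loop in weak_scaling.py
kept = [p for p in gen
        if not (p[0] != 'UniformGPUScheme_Baseline' and p[2] is True)]
print(len(kept))  # -> 96 scenarios actually run
```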