Commit 79668534 authored by Martin Bauer's avatar Martin Bauer
Browse files

Extended UniformGridGPU simulation setup

parent 44a56363
......@@ -176,18 +176,14 @@ int main( int argc, char **argv )
auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );
/*
lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" );
timeLoop.run();
*/
auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
timeLoop.run();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");
std::map< std::string, int > integerProperties;
std::map< std::string, double > realProperties;
......@@ -204,9 +200,6 @@ int main( int argc, char **argv )
pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );
pythonCallbackResults.data().exposeValue( "mflups_total", realProperties["MFLUPS"] );
pythonCallbackResults.data().exposeValue( "mflups_process", realProperties["MFLUPS_process"] );
// Call Python function to report results
pythonCallbackResults();
}
......
# encoding: utf-8
import math
import operator
from functools import reduce
import waLBerla as wlb
# Constants that define the size of blocks that are used in the benchmarks
MIN_CELLS_PER_BLOCK = 16
MAX_CELLS_PER_BLOCK = 256
......@@ -13,7 +14,7 @@ cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, I
# Blocks with size in [16, 32, 64, 128, 256]
cells_per_block = [num_cells for num_cells in cells_per_block_interval]
# Number of active MPI processes
num_processes = wlb.mpi.numProcesses()
num_processes = wlb.mpi.numProcesses()
# Whether to overlap computation with communication
overlap_communication = [False, True]
# Whether MPI supports buffers in GPU memory
......@@ -22,6 +23,18 @@ cuda_enabled_mpi = [False, True]
communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
def calculate_time_steps(runtime, expected_mlups, domain_size):
    """Estimate how many LBM time steps fit into a target benchmark runtime.

    :param runtime:        desired benchmark duration in seconds
    :param expected_mlups: expected performance in MLUPS
                           (million lattice-cell updates per second)
    :param domain_size:    iterable with the number of cells per axis
    :return: number of time steps as an int (the 'timesteps' config
             parameter is consumed as an unsigned integer by the C++ app,
             so a float here would be a type mismatch)
    """
    # Total number of lattice cells in the domain
    cells = reduce(operator.mul, domain_size, 1)
    # MLUPS is given in millions of cell updates per second
    time_steps_per_second = expected_mlups * 1e6 / cells
    return int(time_steps_per_second * runtime)
def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
bytes_per_cell = 19 * 2 * 8
max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
return int(max_cells**(1/3))
def get_block_decomposition(block_decomposition, num_processes):
bx = by = bz = 1
blocks_per_axis = int(math.log(num_processes, 2))
......@@ -35,5 +48,4 @@ def get_block_decomposition(block_decomposition, num_processes):
bx *= 2
assert (bx * by * bz) == num_processes
return (bx, by, bz)
return bx, by, bz
......@@ -23,7 +23,7 @@ CommunicationSchemeName = {
# Base configuration for the benchmark
BASE_CONFIG = {
'DomainSetup' : {
'DomainSetup': {
'cellsPerBlock': (64, 64, 64),
'blocks': (1, 1, 1),
'nrOfProcesses': (1, 1, 1),
......@@ -53,44 +53,52 @@ BASE_CONFIG = {
class BenchmarkScenario:
def __init__(self, testcase, decomposition_axes=None):
def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
self.testcase = testcase
self.scenario_config = copy.deepcopy(BASE_CONFIG)
self.scenario_config['Parameters']['timesteps'] = time_steps
self.fully_periodic = fully_periodic
if fully_periodic:
del self.scenario_config['Boundaries']['Border']
self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
self.decomposition_axes = decomposition_axes
now = datetime.now().replace(second=0, microsecond=0)
self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'
@wlb.member_callback
def config(self, **kwargs):
return self.scenario_config
@wlb.member_callback
def results_callback(self, **kwargs):
def get_data(self):
block_setup = self.scenario_config.get('DomainSetup')
params = self.scenario_config.get('Parameters')
data = [{
return {
'processesX': block_setup.get('nrOfProcesses')[0],
'processesY': block_setup.get('nrOfProcesses')[1],
'processesZ': block_setup.get('nrOfProcesses')[2],
'blocksX': block_setup.get('blocks')[0],
'blocksY': block_setup.get('blocks')[1],
'blocksZ': block_setup.get('blocks')[2],
'fully_periodic': self.fully_periodic,
'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
'cudaEnabledMPI': params.get('cudaEnabledMPI'),
'overlapCommunication': params.get('overlapCommunication'),
'time_steps': params['timesteps'],
'domainDecomposition': self.decomposition_axes,
'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
'mlupsTotal': kwargs.get('mlups_total'),
'mlupsProcess': kwargs.get('mlups_process'),
'mflupsTotal': kwargs.get('mflups_total'),
'mflupsProcess': kwargs.get('mflups_process'),
}]
}
self.save_data(data)
@wlb.member_callback
def config(self, **kwargs):
from pprint import pformat
wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
return self.scenario_config
@wlb.member_callback
def results_callback(self, **kwargs):
data = self.get_data()
data.update(kwargs)
self.save_data([data])
def save_data(self, data):
df = pd.DataFrame(data)
......
from os import getcwd
from waLBerla.tools.jobscripts import createJobscript
from datetime import timedelta
# Emit one batch jobscript per node count for the weak-scaling study.
for nodes in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
    script_name = "job_weak_scaling_{:04d}.sh".format(nodes)
    # Open first, then generate, so a failing createJobscript leaves the
    # same partially-created file the original flow would.
    with open(script_name, 'w') as script_file:
        jobscript_text = createJobscript(nodes=nodes,
                                         output_file='out_lbm_bench_%j.txt',
                                         error_file='err_lbm_bench_%j.txt',
                                         initial_dir=getcwd(),
                                         exe_name='UniformGridBenchmarkGPU',
                                         parameter_files=['weak_scaling.py'],
                                         wall_time=timedelta(minutes=25),
                                         machine='pizdaint_hybrid')
        script_file.write(jobscript_text)
......@@ -3,7 +3,7 @@
import itertools
import waLBerla as wlb
from base import get_block_decomposition, communication_schemes, overlap_communication, \
cuda_enabled_mpi, num_processes
cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory
from benchmark import BenchmarkScenario, CommunicationSchemeType
......@@ -14,20 +14,27 @@ scenarios = wlb.ScenarioManager()
#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']
cells_per_block = [64, 128, 240, 256]
# compute number of cells depending on GPU memory i.e. by specifying the percentage of GPU memory to fill
gpu_memory_gb = 16
cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]
expected_mlups = 200 # to compute how many time steps have to be done
time_per_scenarios = 3 # benchmark time in seconds
fully_periodic = [False, True]
if num_processes == 1:
scenario_generator = itertools.product(communication_schemes, [False,], [False,],
block_decompositions, cells_per_block)
scenario_generator = itertools.product(communication_schemes, [False, ], [False, ],
block_decompositions, cells_per_block, fully_periodic)
else:
scenario_generator = itertools.product(communication_schemes, overlap_communication,
cuda_enabled_mpi, block_decompositions, cells_per_block)
scenario_generator = itertools.product(communication_schemes, [True],
cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic)
testcase_name = "weak-scaling"
for scenario_params in scenario_generator:
# Extract parameters from tuple
comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, fully_periodic = scenario_params
if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
# Skip CUDA enabled MPI tests for GPUPackInfo tests
continue
......@@ -39,8 +46,11 @@ for scenario_params in scenario_generator:
decomposition_axes_str = ''.join(decomposition_axes)
# Compute block decomposition based on the specified axes and the number of processes
blocks = get_block_decomposition(decomposition_axes, num_processes)
# Estimate number of time steps
time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,)))
# Create a benchmark scenario
scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str,
time_steps=time_steps, fully_periodic=fully_periodic)
# Domain Setup parameters
domain_setup = scenario.scenario_config['DomainSetup']
domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment