Commit 79668534 authored by Martin Bauer

Extended UniformGridGPU simulation setup

parent 44a56363
@@ -176,18 +176,14 @@ int main( int argc, char **argv )
    auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", 3.0 ); // in seconds
    timeLoop.addFuncAfterTimeStep( timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps(), remainingTimeLoggerFrequency ), "remaining time logger" );

-   /*
-   lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, 500);
-   timeLoop.addFuncAfterTimeStep( performanceLogger, "remaining time logger" );
-   timeLoop.run();
-   */
    auto performanceReportFrequency = parameters.getParameter< uint_t >( "performanceReportFrequency", 500 ); // in timesteps
    lbm::PerformanceLogger<FlagField_T> performanceLogger(blocks, flagFieldID, fluidFlagUID, performanceReportFrequency);
    timeLoop.addFuncAfterTimeStep([&performanceLogger] { performanceLogger(); }, "performance logger" );

+   WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps");
    timeLoop.run();
+   WALBERLA_LOG_INFO_ON_ROOT("Simulation finished");

    std::map< std::string, int > integerProperties;
    std::map< std::string, double > realProperties;
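Both frequencies above are read from the 'Parameters' block of the simulation configuration, which on the Python side is the dictionary returned by the scenario's config callback. A minimal sketch of the corresponding entries (key names taken from the getParameter calls above; the values are only illustrative defaults, not taken from this commit):

    parameters_sketch = {
        'Parameters': {
            'timesteps': 500,                     # also used by the new "Starting simulation" log message
            'remainingTimeLoggerFrequency': 3.0,  # seconds between remaining-time log messages
            'performanceReportFrequency': 500,    # time steps between PerformanceLogger reports
        }
    }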
@@ -204,9 +200,6 @@ int main( int argc, char **argv )
       pythonCallbackResults.data().exposeValue( "mlups_total", realProperties["MLUPS"] );
       pythonCallbackResults.data().exposeValue( "mlups_process", realProperties["MLUPS_process"] );
-      pythonCallbackResults.data().exposeValue( "mflups_total", realProperties["MFLUPS"] );
-      pythonCallbackResults.data().exposeValue( "mflups_process", realProperties["MFLUPS_process"] );

       // Call Python function to report results
       pythonCallbackResults();
    }
...
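The quantities exposed to the Python callback above are the usual lattice Boltzmann throughput numbers: MLUPS is millions of lattice (cell) updates per second over the whole run, and mlups_process is the corresponding per-process figure. A small illustrative sketch of the definition (the numbers are made up, not results from this commit):

    def mlups(total_cells, time_steps, runtime_seconds):
        # million lattice (cell) updates per second
        return total_cells * time_steps / runtime_seconds / 1e6

    # e.g. a 256**3 cell domain running 100 time steps in 8.4 s:
    print(mlups(256 ** 3, 100, 8.4))  # roughly 200 MLUPS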
 # encoding: utf-8
 import math
+import operator
+from functools import reduce
 import waLBerla as wlb

 # Constants that define the size of blocks that are used in the benchmarks
 MIN_CELLS_PER_BLOCK = 16
 MAX_CELLS_PER_BLOCK = 256
@@ -13,7 +14,7 @@ cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, I...
 # Blocks with size in [16, 32, 64, 128, 256]
 cells_per_block = [num_cells for num_cells in cells_per_block_interval]
 # Number of active MPI processes
 num_processes = wlb.mpi.numProcesses()
 # Whether to overlap computation with communication
 overlap_communication = [False, True]
 # Whether MPI supports buffers in GPU memory
@@ -22,6 +23,18 @@ cuda_enabled_mpi = [False, True]
 communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
+
+
+def calculate_time_steps(runtime, expected_mlups, domain_size):
+    # Estimate how many time steps fit into `runtime` seconds for the given
+    # domain size, assuming the benchmark reaches `expected_mlups` MLUPS.
+    cells = reduce(operator.mul, domain_size, 1)
+    time_steps_per_second = expected_mlups * 1e6 / cells
+    return int(time_steps_per_second * runtime)  # whole time steps
+
+
+def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
+    # Side length of a cubic domain that fills the given fraction of GPU memory:
+    # 19 PDFs per cell (D3Q19), stored twice (src/dst fields), 8 bytes per double.
+    bytes_per_cell = 19 * 2 * 8
+    max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
+    return int(max_cells ** (1 / 3))
+
+
 def get_block_decomposition(block_decomposition, num_processes):
     bx = by = bz = 1
     blocks_per_axis = int(math.log(num_processes, 2))
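A worked example with the values used later in weak_scaling.py (16 GB of GPU memory, an expected 200 MLUPS, 3 s of benchmark time per scenario); the numbers are approximate and only illustrate the arithmetic of the two helpers:

    # 19 * 2 * 8 = 304 bytes per cell
    side = side_length_to_fill_memory(0.8, 16)         # int((16e9 / 304 * 0.8) ** (1 / 3)) -> 347
    steps = calculate_time_steps(3, 200, 3 * (side,))  # about 200e6 / 347**3 * 3 -> roughly 14
    # weak_scaling.py additionally applies a lower bound: time_steps = max(50, steps)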
@@ -35,5 +48,4 @@ def get_block_decomposition(block_decomposition, num_processes):
             bx *= 2
     assert (bx * by * bz) == num_processes
-    return bx, by, bz
+    return (bx, by, bz)
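Most of the body of get_block_decomposition is collapsed in this diff; from the visible tail it splits a power-of-two process count (see the int(math.log(num_processes, 2)) line above) into factors of two along the listed axes and asserts that their product equals the process count. A usage sketch based on that reading, not on the full source:

    # Assumed behaviour: each factor of two is assigned to the next axis in the
    # given priority string, so e.g. 8 processes with 'xyz' should yield (2, 2, 2).
    bx, by, bz = get_block_decomposition('xyz', 8)
    assert bx * by * bz == 8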
@@ -23,7 +23,7 @@ CommunicationSchemeName = {
 # Base configuration for the benchmark
 BASE_CONFIG = {
-    'DomainSetup' : {
+    'DomainSetup': {
         'cellsPerBlock': (64, 64, 64),
         'blocks': (1, 1, 1),
         'nrOfProcesses': (1, 1, 1),
@@ -53,44 +53,52 @@ BASE_CONFIG = {
 class BenchmarkScenario:
-    def __init__(self, testcase, decomposition_axes=None):
+    def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
         self.testcase = testcase
         self.scenario_config = copy.deepcopy(BASE_CONFIG)
+        self.scenario_config['Parameters']['timesteps'] = time_steps
+        self.fully_periodic = fully_periodic
+        if fully_periodic:
+            del self.scenario_config['Boundaries']['Border']
+            self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
         self.decomposition_axes = decomposition_axes

         now = datetime.now().replace(second=0, microsecond=0)
         self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'

-    @wlb.member_callback
-    def config(self, **kwargs):
-        return self.scenario_config
-
-    @wlb.member_callback
-    def results_callback(self, **kwargs):
+    def get_data(self):
         block_setup = self.scenario_config.get('DomainSetup')
         params = self.scenario_config.get('Parameters')

-        data = [{
+        return {
             'processesX': block_setup.get('nrOfProcesses')[0],
             'processesY': block_setup.get('nrOfProcesses')[1],
             'processesZ': block_setup.get('nrOfProcesses')[2],
             'blocksX': block_setup.get('blocks')[0],
             'blocksY': block_setup.get('blocks')[1],
             'blocksZ': block_setup.get('blocks')[2],
+            'fully_periodic': self.fully_periodic,
             'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
             'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
             'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
             'cudaEnabledMPI': params.get('cudaEnabledMPI'),
             'overlapCommunication': params.get('overlapCommunication'),
+            'time_steps': params['timesteps'],
             'domainDecomposition': self.decomposition_axes,
             'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
-            'mlupsTotal': kwargs.get('mlups_total'),
-            'mlupsProcess': kwargs.get('mlups_process'),
-            'mflupsTotal': kwargs.get('mflups_total'),
-            'mflupsProcess': kwargs.get('mflups_process'),
-        }]
-
-        self.save_data(data)
+        }
+
+    @wlb.member_callback
+    def config(self, **kwargs):
+        from pprint import pformat
+        wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
+        return self.scenario_config
+
+    @wlb.member_callback
+    def results_callback(self, **kwargs):
+        data = self.get_data()
+        data.update(kwargs)
+        self.save_data([data])

     def save_data(self, data):
         df = pd.DataFrame(data)
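Only the first line of save_data is visible here. A minimal sketch of what such a method could look like, assuming results are appended row by row to the CSV file named in __init__ (an illustration, not the body from the repository):

    def save_data(self, data):
        # assumes `import os` at module level
        df = pd.DataFrame(data)
        # append to the per-testcase CSV; write the header only when the file is new
        df.to_csv(self.output_filename, mode='a', index=False,
                  header=not os.path.exists(self.output_filename))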
...
+from os import getcwd
+from waLBerla.tools.jobscripts import createJobscript
+from datetime import timedelta
+
+for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2400]:
+    with open("job_weak_scaling_{:04d}.sh".format(node_count), 'w') as f:
+        js = createJobscript(nodes=node_count,
+                             output_file='out_lbm_bench_%j.txt',
+                             error_file='err_lbm_bench_%j.txt',
+                             initial_dir=getcwd(),
+                             exe_name='UniformGridBenchmarkGPU',
+                             parameter_files=['weak_scaling.py'],
+                             wall_time=timedelta(minutes=25),
+                             machine='pizdaint_hybrid'
+                             )
+        f.write(js)
@@ -3,7 +3,7 @@
 import itertools
 import waLBerla as wlb
 from base import get_block_decomposition, communication_schemes, overlap_communication, \
-    cuda_enabled_mpi, num_processes
+    cuda_enabled_mpi, num_processes, calculate_time_steps, side_length_to_fill_memory
 from benchmark import BenchmarkScenario, CommunicationSchemeType
@@ -14,20 +14,27 @@ scenarios = wlb.ScenarioManager()
 #block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
 block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']

-cells_per_block = [64, 128, 240, 256]
+# compute number of cells depending on GPU memory i.e. by specifying the percentage of GPU memory to fill
+gpu_memory_gb = 16
+cells_per_block = [side_length_to_fill_memory(pc, gpu_memory_gb) for pc in (0.8, 0.5, 0.05)]
+
+expected_mlups = 200  # to compute how many time steps have to be done
+time_per_scenarios = 3  # benchmark time in seconds
+
+fully_periodic = [False, True]

 if num_processes == 1:
-    scenario_generator = itertools.product(communication_schemes, [False,], [False,],
-                                           block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [False, ], [False, ],
+                                           block_decompositions, cells_per_block, fully_periodic)
 else:
-    scenario_generator = itertools.product(communication_schemes, overlap_communication,
-                                           cuda_enabled_mpi, block_decompositions, cells_per_block)
+    scenario_generator = itertools.product(communication_schemes, [True],
+                                           cuda_enabled_mpi, block_decompositions, cells_per_block, fully_periodic)

 testcase_name = "weak-scaling"

 for scenario_params in scenario_generator:
     # Extract parameters from tuple
-    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
+    comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block, fully_periodic = scenario_params
     if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
         # Skip CUDA enabled MPI tests for GPUPackInfo tests
         continue
@@ -39,8 +46,11 @@ for scenario_params in scenario_generator:
     decomposition_axes_str = ''.join(decomposition_axes)
     # Compute block decomposition based on the specified axes and the number of processes
     blocks = get_block_decomposition(decomposition_axes, num_processes)
+    # Estimate number of time steps
+    time_steps = max(50, calculate_time_steps(time_per_scenarios, expected_mlups, 3 * (num_cells_per_block,)))
     # Create a benchmark scenario
-    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
+    scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str,
+                                 time_steps=time_steps, fully_periodic=fully_periodic)
     # Domain Setup parameters
     domain_setup = scenario.scenario_config['DomainSetup']
     domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
...
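For reference, the sweep defined above can be sized independently: 3 communication schemes x 4 block decompositions x 3 block sizes x 2 periodicity settings give 72 scenarios in the single-process case; the multi-process case doubles this for cudaEnabledMPI and then drops the CUDA-aware-MPI combinations of the two non-baseline schemes. A standalone sketch reproducing that count (the lists are copied from base.py and this file; the cell counts are the illustrative values derived earlier):

    import itertools

    communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
    block_decompositions = ['xyz', 'yzx', 'zyx', 'yxz']
    cells_per_block = [347, 297, 138]  # illustrative: side_length_to_fill_memory(p, 16) for p in (0.8, 0.5, 0.05)
    fully_periodic = [False, True]
    cuda_enabled_mpi = [False, True]

    multi_process = itertools.product(communication_schemes, [True], cuda_enabled_mpi,
                                      block_decompositions, cells_per_block, fully_periodic)
    kept = [p for p in multi_process if not (p[0] != 'UniformGPUScheme_Baseline' and p[2])]
    print(len(kept))  # 96 scenarios in the multi-process case (out of 144 generated)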