Commit 550358c0 authored by Martin Bauer

Cleaned up UniformGridGPU benchmark and adapted it to changes in python codegen packages

parent f50ec7bd
@@ -98,7 +98,7 @@ int main( int argc, char **argv )
const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);
const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", true);
// Creating fields
BlockDataID pdfFieldCpuID = field::addToStorage< PdfField_T >( blocks, "pdfs cpu", real_t(0), field::fzyx);
@@ -165,7 +165,12 @@ int main( int argc, char **argv )
}
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(int i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
int streamHighPriority = 0;
int streamLowPriority = 0;
......
import sympy as sp
import numpy as np
import pystencils as ps
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule, create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor, StreamPushTwoFieldsAccessor
from pystencils_walberla import generate_pack_info_from_kernel
@@ -39,7 +39,7 @@ options_dict = {
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
'relaxation_rates': [omega, 1.3, 1.4, 1.2, 1.1, 1.15, 1.234, 1.4235],
},
'mrt_full': {
'method': 'mrt',
@@ -147,7 +147,7 @@ with CodeGeneration() as ctx:
# CPU lattice model - required for macroscopic value computation, VTK output etc.
options_without_opt = options.copy()
del options_without_opt['optimization']
generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', lb_method, update_rule_params=options_without_opt)
generate_lattice_model(ctx, 'UniformGridGPU_LatticeModel', create_lb_collision_rule(lb_method=lb_method, **options_without_opt))
# gpu LB sweep & boundaries
generate_sweep(ctx, 'UniformGridGPU_LbKernel', update_rule,
......
@@ -83,6 +83,14 @@ int main( int argc, char **argv )
BlockDataID pdfFieldGpuID = cuda::addGPUFieldToStorage< PdfField_T >( blocks, pdfFieldCpuID, "pdfs on GPU", true );
Vector3<int> innerOuterSplit = parameters.getParameter<Vector3<int> >("innerOuterSplit", Vector3<int>(1, 1, 1));
for(int i=0; i< 3; ++i)
{
if( int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) {
WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock");
}
}
Cell innerOuterSplitCell (innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
bool cudaEnabledMPI = parameters.getParameter<bool>( "cudaEnabledMPI", false );
Vector3<int32_t> gpuBlockSize = parameters.getParameter<Vector3<int32_t> > ("gpuBlockSize", Vector3<int32_t>(256, 1, 1));
......
@@ -36,7 +36,7 @@ options_dict = {
'mrt': {
'method': 'mrt',
'stencil': 'D3Q19',
'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1],
'relaxation_rates': [omega, 1.3, 1.4, omega, 1.2, 1.1],
},
'entropic': {
'method': 'mrt3',
......
# encoding: utf-8
import math
import operator
from functools import reduce
import waLBerla as wlb
# Constants that define the size of blocks that are used in the benchmarks
MIN_CELLS_PER_BLOCK = 16
MAX_CELLS_PER_BLOCK = 256
INC_CELLS_PER_BLOCK = 16
# Number of cells per block, per dimension
cells_per_block_interval = range(MIN_CELLS_PER_BLOCK, MAX_CELLS_PER_BLOCK + 1, INC_CELLS_PER_BLOCK)
# Cells per block in each dimension: 16, 32, 48, ..., 256
cells_per_block = [num_cells for num_cells in cells_per_block_interval]
# Number of active MPI processes
num_processes = wlb.mpi.numProcesses()
# Whether to overlap computation with communication
overlap_communication = [False, True]
# Whether MPI supports buffers in GPU memory
cuda_enabled_mpi = [False, True]
# Supported communication schemes
communication_schemes = ['GPUPackInfo_Streams', 'UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']
def calculate_time_steps(runtime, expected_mlups, domain_size):
cells = reduce(operator.mul, domain_size, 1)
time_steps_per_second = expected_mlups * 1e6 / cells  # expected_mlups: expected performance in million lattice (cell) updates per second
return int(time_steps_per_second * runtime)
def side_length_to_fill_memory(memory_fill_percentage, memory_in_gb):
bytes_per_cell = 19 * 2 * 8  # D3Q19: 19 PDF values, two PDF fields (src/dst), 8 bytes per double
max_cells = memory_in_gb * 1e9 / bytes_per_cell * memory_fill_percentage
return int(max_cells**(1/3))
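# Hedged usage sketch (not part of the benchmark setup): the two helpers above can be
# combined to size a run. The 16 GB of GPU memory, 80% fill level, 30 s runtime and
# 1000 MLUPS below are made-up assumptions, not values taken from any configuration.
if __name__ == '__main__':
    edge = side_length_to_fill_memory(memory_fill_percentage=0.8, memory_in_gb=16)
    steps = calculate_time_steps(runtime=30, expected_mlups=1000, domain_size=(edge, edge, edge))
    print(edge, steps)  # roughly 347 cells per edge and about 700 time steps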
def get_block_decomposition(block_decomposition, num_processes):
bx = by = bz = 1
blocks_per_axis = int(math.log(num_processes, 2))
for i in range(blocks_per_axis):
decomposition_axis = block_decomposition[i % len(block_decomposition)]
if decomposition_axis == 'y':
by *= 2
elif decomposition_axis == 'z':
bz *= 2
elif decomposition_axis == 'x':
bx *= 2
assert (bx * by * bz) == num_processes
return bx, by, bz
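# A quick illustration of the round-robin splitting above, assuming a power-of-two
# process count (the example values are illustrative only):
#   get_block_decomposition('xy', 8)  -> (4, 2, 1)   # factors of two go to x, y, x
#   get_block_decomposition('xyz', 8) -> (2, 2, 2)   # one factor of two per axis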
# encoding: utf-8
import os
import pandas as pd
import waLBerla as wlb
import copy
from datetime import datetime
CommunicationSchemeType = {
'GPUPackInfo_Baseline': 0,
'GPUPackInfo_Streams': 1,
'UniformGPUScheme_Baseline': 2,
'UniformGPUScheme_Memcpy': 3,
}
CommunicationSchemeName = {
0: 'GPUPackInfo_Baseline',
1: 'GPUPackInfo_Streams',
2: 'UniformGPUScheme_Baseline',
3: 'UniformGPUScheme_Memcpy',
}
# Base configuration for the benchmark
BASE_CONFIG = {
'DomainSetup': {
'cellsPerBlock': (64, 64, 64),
'blocks': (1, 1, 1),
'nrOfProcesses': (1, 1, 1),
'periodic': (0, 0, 1),
'dx': 1.0
},
'Parameters': {
'omega': 1.8,
'timesteps': 1001,
'remainingTimeLoggerFrequency': 250,
'vtkWriteFrequency': 0,
'overlapCommunication': False,
'cudaEnabledMPI': False,
'initialVelocity': (0, 0, 0),
'performanceReportFrequency': 250,
'communicationScheme': CommunicationSchemeType['UniformGPUScheme_Baseline'],
},
'Boundaries': {
'Border': [
{'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
{'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
]
}
}
class BenchmarkScenario:
def __init__(self, testcase, time_steps, decomposition_axes=None, fully_periodic=False):
self.testcase = testcase
self.scenario_config = copy.deepcopy(BASE_CONFIG)
self.scenario_config['Parameters']['timesteps'] = time_steps
self.fully_periodic = fully_periodic
if fully_periodic:
del self.scenario_config['Boundaries']['Border']
self.scenario_config['DomainSetup']['periodic'] = (1, 1, 1)
self.decomposition_axes = decomposition_axes
now = datetime.now().replace(second=0, microsecond=0)
self.output_filename = f'{self.testcase}_{now.strftime("%Y-%m-%d_%H-%M")}.csv'
def get_data(self):
block_setup = self.scenario_config.get('DomainSetup')
params = self.scenario_config.get('Parameters')
return {
'processesX': block_setup.get('nrOfProcesses')[0],
'processesY': block_setup.get('nrOfProcesses')[1],
'processesZ': block_setup.get('nrOfProcesses')[2],
'blocksX': block_setup.get('blocks')[0],
'blocksY': block_setup.get('blocks')[1],
'blocksZ': block_setup.get('blocks')[2],
'fully_periodic': self.fully_periodic,
'cellsPerBlockX': block_setup.get('cellsPerBlock')[0],
'cellsPerBlockY': block_setup.get('cellsPerBlock')[1],
'cellsPerBlockZ': block_setup.get('cellsPerBlock')[2],
'cudaEnabledMPI': params.get('cudaEnabledMPI'),
'overlapCommunication': params.get('overlapCommunication'),
'time_steps': params['timesteps'],
'domainDecomposition': self.decomposition_axes,
'communicationScheme': CommunicationSchemeName[params.get('communicationScheme')],
}
@wlb.member_callback
def config(self, **kwargs):
from pprint import pformat
wlb.log_info_on_root("Scenario:\n" + pformat(self.get_data()))
return self.scenario_config
@wlb.member_callback
def results_callback(self, **kwargs):
data = self.get_data()
data.update(kwargs)
self.save_data([data])
def save_data(self, data):
df = pd.DataFrame(data)
if not os.path.isfile(self.output_filename):
df.to_csv(self.output_filename, index=False)
else:
df.to_csv(self.output_filename, index=False, mode='a', header=False)
#!/usr/bin/env python3
"""
This is a waLBerla parameter file that tests (almost) all parameter combinations for GPU communication.
Build waLBerla with -DWALBERLA_BUILD_WITH_PYTHON=1, then run e.g.
./UniformGridBenchmarkGPU_AA_trt simulation_setup/benchmark_configs.py
Look at the end of this file to select the benchmark to run.
"""
import os
import pandas as pd
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from os import getcwd
from waLBerla.tools.jobscripts import createJobscript
from datetime import timedelta
from waLBerla.tools.config import block_decomposition, toPrm
from waLBerla.tools.sqlitedb import *
from copy import deepcopy
import sys
import sqlite3
CSV_FILE = "overlap_benchmark.csv"
# Number of time steps run for a workload of 128^3 cells per GPU.
# If twice as many cells are on the GPU, half as many time steps are run, etc.
# Increase this value to get more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 200
DB_FILE = "gpu_benchmark.sqlite3"
BASE_CONFIG = {
'DomainSetup': {
@@ -20,15 +28,19 @@ BASE_CONFIG = {
},
'Parameters': {
'omega': 1.8,
'timesteps': 400,
'cudaEnabledMPI': False,
'warmupSteps': 5,
'outerIterations': 3,
'initShearFlow': True,
}
}
def num_time_steps(block_size):
cells = block_size[0] * block_size[1] * block_size[2]
time_steps = (128 ** 3 / cells) * TIME_STEPS_FOR_128_BLOCK
return int(time_steps)
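# Worked example of the scaling rule above (with TIME_STEPS_FOR_128_BLOCK = 200):
#   num_time_steps((128, 128, 128)) -> 200
#   num_time_steps((256, 128, 128)) -> 100    (twice the cells, half the steps)
#   num_time_steps((64, 64, 64))    -> 1600   (an eighth of the cells, eight times the steps)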
class Scenario:
def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
self.config_dict = deepcopy(BASE_CONFIG)
@@ -40,6 +52,8 @@ class Scenario:
def config(self, **kwargs):
from pprint import pformat
wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
# Write out the configuration as text-based prm:
#print(toPrm(self.config_dict))
return self.config_dict
@wlb.member_callback
@@ -54,14 +68,26 @@ class Scenario:
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
if not os.path.isfile(CSV_FILE):
df.to_csv(CSV_FILE, index=False)
else:
df.to_csv(CSV_FILE, index=False, mode='a', header=False)
result = data
sequenceValuesToScalars(result)
num_tries = 4
for num_try in range(num_tries):  # retry a few times: the write may fail when several benchmark processes access the database concurrently
try:
checkAndUpdateSchema(result, "runs", DB_FILE)
storeSingle(result, "runs", DB_FILE)
break
except sqlite3.OperationalError as e:
wlb.log_warning("Sqlite DB writing failed: try {}/{} {}".format(num_try+1, num_tries, str(e)))
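# Hedged sketch (not called anywhere in this script): the accumulated results can later
# be read back for analysis. This assumes the "runs" table written above by
# checkAndUpdateSchema/storeSingle exists in DB_FILE; pandas is only needed here.
def load_results(db_file=DB_FILE):
    import sqlite3
    import pandas as pd
    with sqlite3.connect(db_file) as db:
        return pd.read_sql_query("SELECT * FROM runs", db)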
# -------------------------------------- Functions trying different parameter sets -------------------------------------------------------------------
def overlap_benchmark():
"""Tests different communication overlapping strategies"""
wlb.log_info_on_root("Running different communication overlap strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
@@ -76,11 +102,43 @@ def overlap_benchmark():
for inner_outer_split in inner_outer_splits:
scenario = Scenario(timeStepStrategy=overlap_strategy,
communicationScheme=comm_strategy,
innerOuterSplit=inner_outer_split)
innerOuterSplit=inner_outer_split,
timesteps=num_time_steps((256, 128, 128)))
scenarios.add(scenario)
def communication_compare():
"""Tests different communication strategies"""
wlb.log_info_on_root("Running benchmarks to compare communication strategies")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
sc = Scenario(cells_per_block=block_size,
gpuBlockSize=(128, 1, 1),
timeStepStrategy='noOverlap',
communicationScheme=comm_strategy,
timesteps=num_time_steps(block_size))
scenarios.add(sc)
for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
if 3 * inner_outer_split[0] > block_size[0]: # ensure that the inner part of the domain is still large enough
continue
sc = Scenario(cells_per_block=block_size,
gpuBlockSize=(128, 1, 1),
timeStepStrategy='simpleOverlap',
innerOuterSplit=inner_outer_split,
communicationScheme=comm_strategy,
timesteps=num_time_steps(block_size))
scenarios.add(sc)
def single_gpu_benchmark():
"""Benchmarks only the LBM compute kernel"""
wlb.log_info_on_root("Running single GPU benchmarks")
wlb.log_info_on_root("")
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
@@ -90,47 +148,93 @@ def single_gpu_benchmark():
(32, 16, 1)]
for block_size in block_sizes:
for cuda_block_size in cuda_blocks:
cells = block_size[0] * block_size[1] * block_size[2]
time_steps_for_128_cubed = 1000
time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
scenario = Scenario(cells_per_block=block_size,
gpuBlockSize=cuda_block_size,
timeStepStrategy='kernelOnly',
timesteps=int(time_steps))
timesteps=num_time_steps(block_size))
scenarios.add(scenario)
all_executables = ('UniformGridBenchmarkGPU_AA_entropic',
'UniformGridBenchmarkGPU_AA_mrt',
'UniformGridBenchmarkGPU_AA_smagorinsky',
'UniformGridBenchmarkGPU_AA_srt',
'UniformGridBenchmarkGPU_AA_trt',
'UniformGridBenchmarkGPU_entropic',
'UniformGridBenchmarkGPU_mrt',
'UniformGridBenchmarkGPU_smagorinsky',
'UniformGridBenchmarkGPU_srt',
'UniformGridBenchmarkGPU_trt')
# -------------------------------------- Optional job script generation for PizDaint -------------------------------------------------------------------
job_script_header = """
#!/bin/bash -l
#SBATCH --job-name=scaling
#SBATCH --time=0:30:00
#SBATCH --nodes={nodes}
#SBATCH -o out_scaling_{nodes}_%j.txt
#SBATCH -e err_scaling_{nodes}_%j.txt
#SBATCH --ntasks-per-core=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=normal
#SBATCH --constraint=gpu
#SBATCH --account=d105
cd {folder}
source ~/env.sh
def generate_jobscripts(machine='pizdaint_hybrid',
exe_names=all_executables):
module load daint-gpu
module load craype-accel-nvidia60
export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer
export CRAY_CUDA_MPS=1 # allow GPU sharing
export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export CRAY_CUDA_MPS=1
export MPICH_RANK_REORDER_METHOD=3
export PMI_MMAP_SYNC_WAIT_TIME=300
# grid_order -R -H -c 1,1,8 -g 16,16,8
ulimit -c 0
"""
job_script_exe_part = """
export WALBERLA_SCENARIO_IDX=0
while srun -n {nodes} ./{app} {config}
do
((WALBERLA_SCENARIO_IDX++))
done
"""
all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
'UniformGridBenchmarkGPU_smagorinsky_d3q27',
'UniformGridBenchmarkGPU_cumulant',
'UniformGridBenchmarkGPU_cumulant_d3q27')
def generate_jobscripts(exe_names=all_executables):
for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
with open("job_overlap_benchmark_{:04d}.sh".format(node_count), 'w') as f:
folder_name = "scaling_{:04d}".format(node_count)
os.makedirs(folder_name, exist_ok=True)
# run grid_order
import subprocess
decomposition = block_decomposition(node_count)
decomposition_str = ",".join(str(e) for e in decomposition)
subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])
job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
for exe in exe_names:
job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count, config='../communication_compare.py')
js = createJobscript(nodes=node_count,
output_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
error_file='overlap_bench_{:04d}_%j.txt'.format(node_count),
initial_dir=getcwd(),
commands=list(("./" + exe, 'overlap_benchmark.py') for exe in exe_names),
wall_time=timedelta(minutes=25),
machine=machine,
account='d105',
)
f.write(js)
with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
f.write(job_script)
if __name__ == '__main__':
print("Called without waLBerla - generating job scripts for PizDaint")
generate_jobscripts()
else:
single_gpu_benchmark()
wlb.log_info_on_root("Batch run of benchmark scenarios, saving result to {}".format(DB_FILE))
# Select the benchmark you want to run
single_gpu_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU performance of compute kernel (no communication)
#communication_compare() # benchmarks different communication routines, with and without overlap
#overlap_benchmark() # benchmarks different communication overlap options
# encoding: utf-8
import itertools
import waLBerla as wlb
from base import get_block_decomposition, communication_schemes, overlap_communication, \
cuda_enabled_mpi, cells_per_block, num_processes
from benchmark import BenchmarkScenario, CommunicationSchemeType
# Stores the scenarios for the current simulation
scenarios = wlb.ScenarioManager()
# Generates all block decompositions of xyz, 2 directions at a time
#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
block_decompositions = ['xy', 'yz', 'xz']
scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
testcase_name = "inter-node"
for scenario_params in scenario_generator:
# Extract parameters from tuple
comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
# Skip CUDA-enabled MPI runs for all schemes except UniformGPUScheme_Baseline
continue
elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
# Skip communication overlap tests for GPUPackInfo baseline
continue
# Convert the axes decompositions to string
decomposition_axes_str = ''.join(decomposition_axes)
# Compute block decomposition based on the specified axes and the number of processes
blocks = get_block_decomposition(decomposition_axes, num_processes)
# Create a benchmark scenario
scenario = BenchmarkScenario(testcase=testcase_name, decomposition_axes=decomposition_axes_str)
# Domain Setup parameters
domain_setup = scenario.scenario_config['DomainSetup']
domain_setup['cellsPerBlock'] = 3 * (num_cells_per_block,)
domain_setup['nrOfProcesses'] = blocks
domain_setup['blocks'] = blocks
# Additional parameters for benchmarking
params = scenario.scenario_config['Parameters']
params['cudaEnabledMPI'] = is_cuda_enabled_mpi
params['overlapCommunication'] = is_communication_overlapped
params['communicationScheme'] = CommunicationSchemeType[comm_scheme]
# Add scenario for execution
scenarios.add(scenario)
# encoding: utf-8
import itertools
import waLBerla as wlb
from base import get_block_decomposition, communication_schemes, overlap_communication, \
cuda_enabled_mpi, cells_per_block, num_processes
from benchmark import BenchmarkScenario, CommunicationSchemeType
# Stores the scenarios for the current simulation
scenarios = wlb.ScenarioManager()
# Generates all block decompositions of xyz, 2 directions at a time
#block_decompositions = itertools.combinations_with_replacement('xyz', r=2)
block_decompositions = ['xy', 'yz', 'zx']
scenario_generator = itertools.product(communication_schemes, overlap_communication, cuda_enabled_mpi, block_decompositions, cells_per_block)
testcase_name = "intra-node"
for scenario_params in scenario_generator:
# Extract parameters from tuple
comm_scheme, is_communication_overlapped, is_cuda_enabled_mpi, decomposition_axes, num_cells_per_block = scenario_params
if comm_scheme != 'UniformGPUScheme_Baseline' and is_cuda_enabled_mpi is True:
# Skip CUDA-enabled MPI runs for all schemes except UniformGPUScheme_Baseline
continue
elif comm_scheme == 'GPUPackInfo_Baseline' and is_communication_overlapped is True:
# Skip communication overlap tests for GPUPackInfo baseline