Commit a1f716d0 authored by Martin Bauer

New benchmark for generated LBM on uniform grid on CPU

parent cb331d8f, branch gencpubench
Pipeline #18620 passed in 2 hours, 51 minutes, and 49 seconds
@@ -14,3 +14,4 @@ add_subdirectory( ProbeVsExtraMessage )
 add_subdirectory( SchaeferTurek )
 add_subdirectory( UniformGrid )
 add_subdirectory( UniformGridGPU )
+add_subdirectory( UniformGridGenerated )
#!/usr/bin/env python3
import os
import pandas as pd
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from copy import deepcopy
import sys
CSV_FILE = "new.csv"
BASE_CONFIG = {
    'DomainSetup': {
        'cellsPerBlock': (256, 128, 128),
        'periodic': (1, 1, 1),
    },
    'Parameters': {
        'omega': 1.8,
        'timesteps': 400,
        'cudaEnabledMPI': False,
        'warmupSteps': 5,
        'outerIterations': 3,
        'initShearFlow': True,
    }
}
class Scenario:
    def __init__(self, cells_per_block=(256, 128, 128), **kwargs):
        self.config_dict = deepcopy(BASE_CONFIG)
        self.config_dict['Parameters'].update(kwargs)
        self.config_dict['DomainSetup']['blocks'] = block_decomposition(wlb.mpi.numProcesses())
        self.config_dict['DomainSetup']['cellsPerBlock'] = cells_per_block

    @wlb.member_callback
    def config(self, **kwargs):
        from pprint import pformat
        wlb.log_info_on_root("Scenario:\n" + pformat(self.config_dict))
        return self.config_dict
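    # waLBerla invokes this callback once per scenario to fetch the parameter
    # dictionary; on the C++ side the returned dict is consumed via
    # python_coupling::configBegin/configEnd in place of a parameter file.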
    @wlb.member_callback
    def results_callback(self, **kwargs):
        data = {}
        data.update(self.config_dict['Parameters'])
        data.update(self.config_dict['DomainSetup'])
        data.update(kwargs)
        data['executable'] = sys.argv[0]
        data['compile_flags'] = wlb.build_info.compiler_flags
        data['walberla_version'] = wlb.build_info.version
        data['build_machine'] = wlb.build_info.build_machine
        sequenceValuesToScalars(data)

        df = pd.DataFrame.from_records([data])
        if not os.path.isfile(CSV_FILE):
            df.to_csv(CSV_FILE, index=False)
        else:
            # append; the CSV header is written only when the file is first created
            df.to_csv(CSV_FILE, index=False, mode='a', header=False)
def overlap_benchmark():
    scenarios = wlb.ScenarioManager()
    inner_outer_splits = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
                          (4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
                          (4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]

    for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:  # 'GPUPackInfo_Baseline', 'GPUPackInfo_Streams'
        # no overlap
        scenarios.add(Scenario(timeStepStrategy='noOverlap',
                               communicationScheme=comm_strategy,
                               innerOuterSplit=(1, 1, 1)))
        # overlap
        for overlap_strategy in ['simpleOverlap', 'complexOverlap']:
            for inner_outer_split in inner_outer_splits:
                scenario = Scenario(timeStepStrategy=overlap_strategy,
                                    communicationScheme=comm_strategy,
                                    innerOuterSplit=inner_outer_split)
                scenarios.add(scenario)
def communication_compare():
    scenarios = wlb.ScenarioManager()
    for block_size in [(128, 128, 128), (32, 32, 32), (64, 64, 64), (256, 256, 256)]:
        for comm_strategy in ['UniformGPUScheme_Baseline', 'UniformGPUScheme_Memcpy']:
            sc = Scenario(cells_per_block=block_size,
                          gpuBlockSize=(128, 1, 1),
                          timeStepStrategy='noOverlap',
                          communicationScheme=comm_strategy)
            scenarios.add(sc)
            for inner_outer_split in [(4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1)]:
                # skip splits that are too wide for the block: the inner region
                # must stay non-empty after the outer layers are carved out
                if 3 * inner_outer_split[0] > block_size[0]:
                    continue
                sc = Scenario(cells_per_block=block_size,
                              gpuBlockSize=(128, 1, 1),
                              timeStepStrategy='simpleOverlap',
                              innerOuterSplit=inner_outer_split,
                              communicationScheme=comm_strategy)
                scenarios.add(sc)
def single_gpu_benchmark():
    scenarios = wlb.ScenarioManager()
    block_sizes = [(i, i, i) for i in (64, 128, 256, 384)] + [(512, 512, 128)]
    cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
                   (32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
                   (32, 4, 1), (64, 4, 1), (128, 4, 1),
                   (32, 8, 1), (64, 8, 1),
                   (32, 16, 1)]
    for block_size in block_sizes:
        for cuda_block_size in cuda_blocks:
            cells = block_size[0] * block_size[1] * block_size[2]
            time_steps_for_128_cubed = 1000
            time_steps = (128 ** 3 / cells) * time_steps_for_128_cubed
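            # keep the total work (cells * time steps) constant across block
            # sizes, e.g. a 256^3 block runs 125 steps where a 128^3 block
            # runs 1000, so every configuration takes a comparable wall time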
            scenario = Scenario(cells_per_block=block_size,
                                gpuBlockSize=cuda_block_size,
                                timeStepStrategy='kernelOnly',
                                timesteps=int(time_steps))
            scenarios.add(scenario)
job_script_header = """
#!/bin/bash -l
#SBATCH --job-name=scaling
#SBATCH --time=0:30:00
#SBATCH --nodes={nodes}
#SBATCH -o out_scaling_{nodes}_%j.txt
#SBATCH -e err_scaling_{nodes}_%j.txt
#SBATCH --ntasks-per-core=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --partition=normal
#SBATCH --constraint=gpu
#SBATCH --account=d105
cd {folder}
source ~/env.sh
module load daint-gpu
module load craype-accel-nvidia60
export MPICH_RDMA_ENABLED_CUDA=1 # allow GPU-GPU data transfer
export CRAY_CUDA_MPS=1 # allow GPU sharing
export MPICH_G2G_PIPELINE=256 # adapt maximum number of concurrent in-flight messages
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
export MPICH_RANK_REORDER_METHOD=3
export PMI_MMAP_SYNC_WAIT_TIME=300
# grid_order -R -H -c 1,1,8 -g 16,16,8
ulimit -c 0
"""
job_script_exe_part = """
export WALBERLA_SCENARIO_IDX=0
while srun -n {nodes} ./{app} {config}
do
((WALBERLA_SCENARIO_IDX++))
done
"""
all_executables = ('UniformGridBenchmarkGPU_mrt_d3q27',
                   'UniformGridBenchmarkGPU_smagorinsky_d3q27',
                   'UniformGridBenchmarkGPU_cumulant',
                   'UniformGridBenchmarkGPU_cumulant_d3q27',
                   )
def generate_jobscripts(exe_names=all_executables):
    for node_count in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 2400]:
        folder_name = "scaling_{:04d}".format(node_count)
        os.makedirs(folder_name, exist_ok=True)

        # run grid_order
        import subprocess
        decomposition = block_decomposition(node_count)
        decomposition_str = ",".join(str(e) for e in decomposition)
        subprocess.check_call(['grid_order', '-R', '-H', '-g', decomposition_str])

        job_script = job_script_header.format(nodes=node_count, folder=os.path.join(os.getcwd(), folder_name))
        for exe in exe_names:
            job_script += job_script_exe_part.format(app="../" + exe, nodes=node_count,
                                                     config='../communication_compare.py')

        with open(os.path.join(folder_name, 'job.sh'), 'w') as f:
            f.write(job_script)
if __name__ == '__main__':
    print("Called without waLBerla - generating job scripts for PizDaint")
    generate_jobscripts()
else:
    single_gpu_benchmark()
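# Typical usage (an assumption, not stated in this commit): pass this script as
# the config argument of a compiled benchmark executable, e.g.
#   srun -n <ranks> ./UniformGridBenchmarkGenerated_trt <this_script>.py
# Running the script directly with Python instead generates the SLURM job
# scripts above.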
waLBerla_python_file_generates(UniformGridGenerated.py
                               UniformGridGenerated_LatticeModel.cpp
                               UniformGridGenerated_Defines.h)

foreach(config trt)
    waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config}
                              FILES UniformGridGenerated.cpp UniformGridGenerated.py
                              DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui
                              CODEGEN_CFG ${config})
endforeach()
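# Codegen wiring: waLBerla_python_file_generates declares the outputs of
# UniformGridGenerated.py, and CODEGEN_CFG hands ${config} to the script
# (read there as ctx.config), so each foreach entry builds an executable
# from its own generated lattice model.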
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/math/Random.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
#include "blockforest/Initialization.h"
#include "field/vtk/VTKWriter.h"
#include "field/communication/PackInfo.h"
#include "blockforest/communication/UniformBufferedScheme.h"
#include "timeloop/all.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "domain_decomposition/SharedSweep.h"
#include "lbm/communication/PdfFieldPackInfo.h"
#include "lbm/field/AddToStorage.h"
#include "lbm/vtk/VTKOutput.h"
#include "lbm/gui/Connection.h"
#include "lbm/vtk/Velocity.h"
#include "gui/Gui.h"
#include "UniformGridGenerated_LatticeModel.h"
#include "UniformGridGenerated_Defines.h"
using namespace walberla;
typedef lbm::UniformGridGenerated_LatticeModel LatticeModel_T;
typedef LatticeModel_T::Stencil Stencil_T;
typedef LatticeModel_T::CommunicationStencil CommunicationStencil_T;
typedef lbm::PdfField< LatticeModel_T > PdfField_T;
void initShearVelocity(const shared_ptr<StructuredBlockStorage> & blocks, BlockDataID pdfFieldId,
                       const real_t xMagnitude=0.1, const real_t fluctuationMagnitude=0.05 )
{
    math::seedRandomGenerator(0);
    auto halfZ = blocks->getDomainCellBB().zMax() / 2;
    for( auto & block: *blocks)
    {
        auto pdfField = block.getData<PdfField_T>( pdfFieldId );
        WALBERLA_FOR_ALL_CELLS_INCLUDING_GHOST_LAYER_XYZ(pdfField,
            Cell globalCell;
            blocks->transformBlockLocalToGlobalCell(globalCell, block, Cell(x, y, z));
            real_t randomReal = xMagnitude * math::realRandom<real_t>(-fluctuationMagnitude, fluctuationMagnitude);
            // shear flow: +x velocity in the upper half of the domain, -x in the
            // lower half, with a small random z component to perturb the interface
            if( globalCell[2] >= halfZ ) {
                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(xMagnitude, 0, randomReal), real_t(1.0));
            } else {
                pdfField->setDensityAndVelocity(x, y, z, Vector3<real_t>(-xMagnitude, 0, randomReal), real_t(1.0));
            }
        );
    }
}
int main( int argc, char **argv )
{
    mpi::Environment env( argc, argv );

    for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg )
    {
        WALBERLA_MPI_WORLD_BARRIER();

        auto config = *cfg;
        logging::configureLogging( config );
        auto blocks = blockforest::createUniformBlockGridFromConfig( config );
        Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" );

        // Reading parameters
        auto parameters = config->getOneBlock( "Parameters" );
        const std::string timeStepStrategy = parameters.getParameter<std::string>( "timeStepStrategy", "normal");
        const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 ));
        const uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 50 ));
        const bool initShearFlow = parameters.getParameter<bool>("initShearFlow", false);

        // Creating fields
        LatticeModel_T latticeModel = LatticeModel_T( omega );
        BlockDataID pdfFieldId = lbm::addPdfFieldToStorage( blocks, "pdf field", latticeModel);
        if( initShearFlow ) {
            initShearVelocity(blocks, pdfFieldId);
        }

        SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps );
        blockforest::communication::UniformBufferedScheme< CommunicationStencil_T > communication( blocks );
        communication.addPackInfo( make_shared< lbm::PdfFieldPackInfo< LatticeModel_T > >( pdfFieldId ) );

        timeLoop.add() << BeforeFunction( communication, "communication" )
                       << Sweep( LatticeModel_T::Sweep( pdfFieldId ), "LB stream & collide" );

        // run a few steps outside the timed region to warm up
        int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 );
        int outerIterations = parameters.getParameter<int>( "outerIterations", 1 );
        for( int i = 0; i < warmupSteps; ++i )
            timeLoop.singleStep();

        auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds
        if( remainingTimeLoggerFrequency > 0 ) {
            auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * outerIterations, remainingTimeLoggerFrequency );
            timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" );
        }
        // VTK
        uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 );
        if( vtkWriteFrequency > 0 )
        {
            auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
                                                             "simulation_step", false, true, true, false, 0 );
            auto velWriter = make_shared< lbm::VelocityVTKWriter<LatticeModel_T> >(pdfFieldId, "vel");
            vtkOutput->addCellDataWriter(velWriter);
            timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" );
        }

        bool useGui = parameters.getParameter<bool>( "useGui", false );
        if( useGui )
        {
            GUI gui( timeLoop, blocks, argc, argv);
            lbm::connectToGui<LatticeModel_T>(gui);
            gui.run();
        }
        else
        {
            for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration )
            {
                timeLoop.setCurrentTimeStepToZero();
                WcTimer simTimer;
                WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
                simTimer.start();
                timeLoop.run();
                simTimer.end();
                WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );

                auto time = simTimer.last();
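                // MLUPS = million lattice-cell updates per second, reported per
                // process; nrOfCells is the per-block cell count, which matches
                // the per-process workload if there is one block per process
                // (as the config script's block_decomposition sets up).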
                auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
                auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
                WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
                WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));

                WALBERLA_ROOT_SECTION()
                {
                    python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
                    if ( pythonCallbackResults.isCallable())
                    {
                        pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
                        pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
                        pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
                        pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
                        pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
                        // Call Python function to report results
                        pythonCallbackResults();
                    }
                }
            }
        }
    }
    return 0;
}
import sympy as sp
import pystencils as ps
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.fieldaccess import StreamPullTwoFieldsAccessor
from lbmpy_walberla import generate_lattice_model
from pystencils_walberla import CodeGeneration
omega = sp.symbols("omega")
omega_fill = sp.symbols("omega_:10")
options_dict = {
    'srt': {
        'method': 'srt',
        'stencil': 'D3Q19',
        'relaxation_rate': omega,
        'compressible': False,
    },
    'trt': {
        'method': 'trt',
        'stencil': 'D3Q19',
        'relaxation_rate': omega,
    },
    'mrt': {
        'method': 'mrt',
        'stencil': 'D3Q19',
        'relaxation_rates': [0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7],
    },
    'mrt_full': {
        'method': 'mrt',
        'stencil': 'D3Q19',
        'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2],
                             omega_fill[3], omega_fill[4], omega_fill[5]],
    },
    'mrt3': {
        'method': 'mrt3',
        'stencil': 'D3Q19',
        'relaxation_rates': [omega, 1.1, 1.2],
    },
    'entropic': {
        'method': 'mrt3',
        'stencil': 'D3Q19',
        'compressible': True,
        'relaxation_rates': [omega, omega, sp.Symbol("omega_free")],
        'entropic': True,
    },
    'entropic_kbc_n4': {
        'method': 'trt-kbc-n4',
        'stencil': 'D3Q27',
        'compressible': True,
        'relaxation_rates': [omega, sp.Symbol("omega_free")],
        'entropic': True,
    },
    'smagorinsky': {
        'method': 'srt',
        'stencil': 'D3Q19',
        'smagorinsky': True,
        'relaxation_rate': omega,
    },
    'cumulant': {
        'stencil': 'D3Q19',
        'compressible': True,
        'method': 'mrt',
        'cumulant': True,
        'relaxation_rates': [0, omega, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    },
}
info_header = """
#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
const char * infoStencil = "{stencil}";
const char * infoConfigName = "{configName}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
    accessor = StreamPullTwoFieldsAccessor()
    assert not accessor.is_inplace, "This app does not work for inplace accessors"
    common_options = {
        'field_name': 'pdfs',
        'temporary_field_name': 'pdfs_tmp',
        'kernel_type': accessor,
        'optimization': {'cse_global': True,
                         'cse_pdfs': False}
    }
    config_name = ctx.config
    noopt = False
    d3q27 = False
    if config_name.endswith("_noopt"):
        noopt = True
        config_name = config_name[:-len("_noopt")]
    if config_name.endswith("_d3q27"):
        d3q27 = True
        config_name = config_name[:-len("_d3q27")]
    # copy first so the shared options_dict entry is not mutated
    options = options_dict[config_name].copy()
    options.update(common_options)
    if noopt:
        options['optimization']['cse_global'] = False
        options['optimization']['cse_pdfs'] = False
    if d3q27:
        options['stencil'] = 'D3Q27'

    stencil_str = options['stencil']
    q = int(stencil_str[stencil_str.find('Q') + 1:])
    pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
    options['optimization']['symbolic_field'] = pdfs
    # note: vp is not referenced below; it appears to be a leftover from the
    # GPU variant of this script
    vp = [
        ('double', 'omega_0'),
        ('double', 'omega_1'),
        ('double', 'omega_2'),
        ('double', 'omega_3'),
        ('double', 'omega_4'),
        ('double', 'omega_5'),
        ('double', 'omega_6'),
        ('int32_t', 'cudaBlockSize0'),
        ('int32_t', 'cudaBlockSize1'),
    ]
    update_rule = create_lb_collision_rule(**options)
    generate_lattice_model(ctx, 'UniformGridGenerated_LatticeModel', update_rule)

    infoHeaderParams = {
        'stencil': stencil_str,
        'q': q,
        'configName': ctx.config,
        'cse_global': int(options['optimization']['cse_global']),
        'cse_pdfs': int(options['optimization']['cse_pdfs']),
    }
    ctx.write_file("UniformGridGenerated_Defines.h", info_header.format(**infoHeaderParams))