Commit aeff901c authored by Martin Bauer
Browse files

UniformGridGenerated: detailed output

parent 5e9438d0
waLBerla_link_files_to_builddir( "*.prm" )
waLBerla_link_files_to_builddir( "*.py" )
waLBerla_python_file_generates(UniformGridGenerated.py
......
......@@ -13,7 +13,6 @@ Parameters
outerIterations 1; // how many measurements to conduct
vtkWriteFrequency 100; // write a VTK file every n'th step, if zero VTK output is disabled
cudaEnabledMPI false; // switch on if you have a CUDA-enabled MPI implementation
timeStepMode aa; // can be: noOverlap, simpleOverlap, complexOverlap, kernelOnly
remainingTimeLoggerFrequency 0; // interval in seconds to log the estimated remaining time
directComm 1;
......
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/OpenMP.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/PythonCallback.h"
#include "python_coupling/DictWrapper.h"
......@@ -11,6 +12,7 @@
#include "timeloop/all.h"
#include "core/timing/TimingPool.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/waLBerlaBuildInfo.h"
#include "domain_decomposition/SharedSweep.h"
#include "gui/Gui.h"
#include "InitShearVelocity.h"
......@@ -30,6 +32,7 @@
#include "GenMpiDtypeInfoAAPull.h"
#include "GenMpiDtypeInfoAAPush.h"
#include <iomanip>
using namespace walberla;
......@@ -162,24 +165,26 @@ int main( int argc, char **argv )
{
timeLoop.setCurrentTimeStepToZero();
WcTimer simTimer;
WALBERLA_LOG_INFO_ON_ROOT( "Starting simulation with " << timesteps << " time steps" );
auto threads = omp_get_max_threads();
simTimer.start();
timeLoop.run();
/*
pystencils::GenLbKernelAAEven k1(pdfFieldId, omega);
pystencils::GenLbKernelAAOdd k2(pdfFieldId, omega);
for(int t=0; t < timesteps / 2; ++t)
{ for( auto & b : *blocks) {
k1(&b);
k2(&b);
}}*/
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT( "Simulation finished" );
auto time = simTimer.last();
auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] );
auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT( "MLUPS per process " << mlupsPerProcess );
WALBERLA_LOG_RESULT_ON_ROOT( "Time per time step " << time / real_c( timesteps ));
using std::setw;
WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode <<
" procs: " << setw(6) << MPIManager::instance()->numProcesses() <<
" threads: " << threads <<
" direct_comm: " << directComm <<
" time steps: " << timesteps <<
setw(15) << " block size: " << cellsPerBlock <<
" mlups/core: " << int(mlupsPerProcess/ threads) <<
" mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses());
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults( "results_callback" );
......@@ -188,8 +193,11 @@ int main( int argc, char **argv )
pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess );
pythonCallbackResults.data().exposeValue( "stencil", infoStencil );
pythonCallbackResults.data().exposeValue( "configName", infoConfigName );
pythonCallbackResults.data().exposeValue( "cse_global", infoCseGlobal );
pythonCallbackResults.data().exposeValue( "cse_pdfs", infoCsePdfs );
pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict );
pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() );
pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() );
pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() );
// Call Python function to report results
pythonCallbackResults();
}
......
......@@ -69,25 +69,33 @@ info_header = """
#include "stencil/D3Q{q}.h"\nusing Stencil_T = walberla::stencil::D3Q{q};
const char * infoStencil = "{stencil}";
const char * infoConfigName = "{configName}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
const char * optimizationDict = "{optimizationDict}";
"""
with CodeGeneration() as ctx:
common_options = {
'field_name': 'pdfs',
'temporary_field_name': 'pdfs_tmp',
'optimization': {'cse_global': False,
'cse_pdfs': False,
'split': False}
}
opts = {
'two_field_cse_pdfs': False,
'two_field_cse_global': False,
'two_field_split': True,
'two_field_nt_stores': True,
'aa_even_cse_pdfs': False,
'aa_even_cse_global': False,
'aa_even_split': False,
'aa_even_nt_stores': False,
'aa_odd_cse_pdfs': False,
'aa_odd_cse_global': False,
'aa_odd_split': True,
'aa_odd_nt_stores': False,
}
config_name = ctx.config
noopt = False
d3q27 = False
if config_name.endswith("_noopt"):
noopt = True
config_name = config_name[:-len("_noopt")]
if config_name.endswith("_d3q27"):
d3q27 = True
config_name = config_name[:-len("_d3q27")]
......@@ -104,20 +112,33 @@ with CodeGeneration() as ctx:
stencil_str = options['stencil']
q = int(stencil_str[stencil_str.find('Q')+1:])
pdfs, velocity_field = ps.fields("pdfs({q}), velocity(3) : double[3D]".format(q=q), layout='fzyx')
options['optimization']['symbolic_field'] = pdfs
update_rule_two_field = create_lb_update_rule(**options)
update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(), **options)
options['optimization']['split'] = True
update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(), **options)
vec = {'nontemporal': False, 'assume_aligned': True, 'assume_inner_stride_one': True}
update_rule_two_field = create_lb_update_rule(optimization={'symbolic_field': pdfs,
'split': opts['two_field_split'],
'cse_global': opts['two_field_cse_global'],
'cse_pdfs': opts['two_field_cse_pdfs']}, **options)
update_rule_aa_even = create_lb_update_rule(kernel_type=AAEvenTimeStepAccessor(),
optimization={'symbolic_field': pdfs,
'split': opts['aa_even_split'],
'cse_global': opts['aa_even_cse_global'],
'cse_pdfs': opts['aa_even_cse_pdfs']}, **options)
update_rule_aa_odd = create_lb_update_rule(kernel_type=AAOddTimeStepAccessor(),
optimization={'symbolic_field': pdfs,
'split': opts['aa_odd_split'],
'cse_global': opts['aa_odd_cse_global'],
'cse_pdfs': opts['aa_odd_cse_pdfs']}, **options)
vec = { 'assume_aligned': True, 'assume_inner_stride_one': True}
# Sweeps
generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')])
generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info={'assume_aligned': True},
vec['nontemporal'] = opts['two_field_nt_stores']
generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')],
cpu_vectorize_info=vec)
vec['nontemporal'] = opts['aa_even_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec,
cpu_openmp=True, ghost_layers=1)
generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info={'assume_aligned': True},
vec['nontemporal'] = opts['aa_odd_nt_stores']
generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec,
cpu_openmp=True, ghost_layers=1)
setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector,
......@@ -144,8 +165,6 @@ with CodeGeneration() as ctx:
'stencil': stencil_str,
'q': q,
'configName': ctx.config,
'cse_global': int(options['optimization']['cse_global']),
'cse_pdfs': int(options['optimization']['cse_pdfs']),
'optimizationDict': str(opts),
}
ctx.write_file("GenDefines.h", info_header.format(**infoHeaderParams))
import math
import os
import operator
import waLBerla as wlb
from waLBerla.tools.sqlitedb import *
from functools import reduce
def prod(seq):
    """Return the product of all items in ``seq``; an empty sequence yields 1."""
    result = 1
    for factor in seq:
        result = result * factor
    return result
def get_block_decomposition(block_decomposition, num_processes):
    """Split a power-of-two process count into block counts along x, y and z.

    The axis letters in ``block_decomposition`` are visited cyclically; every
    visit doubles the block count of that axis. log2(num_processes) doublings
    are applied in total.

    Args:
        block_decomposition: sequence of axis letters ('x', 'y' or 'z'), e.g. 'xyz'.
        num_processes: number of processes, expected to be a power of two.

    Returns:
        Tuple ``(bx, by, bz)`` whose product equals ``num_processes``.

    Raises:
        ValueError: if ``num_processes`` is not positive.
        AssertionError: if the resulting block counts do not multiply up to
            ``num_processes`` (not a power of two, or unknown axis letters).
    """
    if num_processes <= 0:
        raise ValueError("num_processes must be positive, got %r" % (num_processes,))

    # Exact integer floor(log2(n)); a floating point logarithm can be off by
    # one ulp and must not be relied on for an exact integer result.
    doublings = int(num_processes).bit_length() - 1

    blocks = {'x': 1, 'y': 1, 'z': 1}
    for i in range(doublings):
        axis = block_decomposition[i % len(block_decomposition)]
        # Unknown axis letters are skipped here and caught by the check below.
        if axis in blocks:
            blocks[axis] *= 2

    bx, by, bz = blocks['x'], blocks['y'], blocks['z']
    assert (bx * by * bz) == num_processes, \
        "cannot decompose %r processes along axes %r" % (num_processes, block_decomposition)
    return bx, by, bz
def calculate_time_steps(runtime, expected_mlups, domain_size):
    """Estimate how many time steps fit into ``runtime``.

    Args:
        runtime: time budget; multiplied with the per-second step rate
            (presumably seconds - confirm with callers).
        expected_mlups: expected throughput, scaled by 1e6 to cell updates.
        domain_size: sequence of cell counts per axis; their product is the
            number of cells updated per time step.

    Returns:
        The number of time steps, truncated to an int.
    """
    total_cells = prod(domain_size)
    cell_updates_per_second = expected_mlups * 1e6
    steps_per_second = cell_updates_per_second / total_cells
    return int(steps_per_second * runtime)
class BenchmarkScenario:
    """One benchmark run: block size, communication mode and time step mode.

    ``config`` produces the configuration dictionary for the simulation and
    ``results_callback`` stores the measured results in an sqlite database.
    """

    def __init__(self, block_size=(256, 128, 128), direct_comm=True, time_step_mode='aa',
                 db_file_name='uniform_grid_gen.sqlite'):
        """Store the scenario parameters and query thread / process counts.

        Args:
            block_size: cells per block along x, y, z (z is scaled by the
                thread count in ``config``).
            direct_comm: value passed on as the 'directComm' parameter.
            time_step_mode: value passed on as the 'timeStepMode' parameter.
            db_file_name: sqlite file the results are written to.

        Raises:
            KeyError: if the environment variable OMP_NUM_THREADS is not set.
        """
        self.block_size = block_size
        self.direct_comm = direct_comm
        self.time_step_mode = time_step_mode
        self.threads = int(os.environ['OMP_NUM_THREADS'])
        self.processes = wlb.mpi.numProcesses()
        self.db_file_name = db_file_name

    @wlb.member_callback
    def config(self, **kwargs):
        """Return the configuration dictionary for this scenario."""
        # Scale the step count inversely with the block volume, using
        # 50 steps for a 128^3 block as reference, but never below 10 steps.
        time_steps_for_128_cubed = 50
        time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed)
        time_steps = max(10, time_steps)
        cfg = {
            'DomainSetup': {
                # One block per process, stacked along z.
                'blocks': (1, 1, self.processes),
                # The z extent grows with the thread count.
                'cellsPerBlock': (self.block_size[0], self.block_size[1], self.block_size[2] * self.threads),
                'periodic': (1, 1, 1),
            },
            'Parameters': {
                'timesteps': time_steps,
                'warmupSteps': 6,
                'outerIterations': 3,
                'vtkWriteFrequency': 0,
                'remainingTimeLoggerFrequency': 0,
                'omega': 1.6,
                'timeStepMode': self.time_step_mode,
                'directComm': self.direct_comm,
            }
        }
        return cfg

    @wlb.member_callback
    def results_callback(self, mlupsPerProcess, optimizations, **kwargs):
        """Collect the results of one run and store them in the sqlite database.

        Args:
            mlupsPerProcess: measured MLUPS per process.
            optimizations: string representation of a dict with the
                optimization options; its entries are merged into the result row.
            **kwargs: additional values, merged into the result row as-is.
        """
        cfg = self.config()
        result = {
            'block_size': self.block_size,
            'mlups_per_core': mlupsPerProcess / self.threads,
            'threads': self.threads,
            'processes': self.processes,
            'time_step_mode': self.time_step_mode,
            'direct_comm': self.direct_comm,
            'time_steps': cfg['Parameters']['timesteps'],
            'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''),
            # Each column records its own environment variable.
            'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_DOMAIN', ''),
        }
        # NOTE(review): eval() executes arbitrary code. This is acceptable only
        # while `optimizations` is a trusted dict literal; consider
        # ast.literal_eval if that assumption ever changes.
        optimizations = eval(optimizations)
        result.update(optimizations)
        result.update(kwargs)
        sequenceValuesToScalars(result)
        checkAndUpdateSchema(result, "runs", self.db_file_name)
        storeSingle(result, "runs", self.db_file_name)
def benchmark():
    """Add one BenchmarkScenario per (block size, direct_comm, time step mode) combination."""
    block_sizes = [
        (128, 128, 128), (128, 64, 64), (64, 64, 128),
        (64, 64, 64), (64, 32, 32), (32, 32, 32),
        (16, 16, 16), (256, 128, 64), (512, 128, 32),
    ]
    direct_comm_options = (True, False)
    time_step_modes = ['aa', 'aaKernelOnly', 'twoField']

    scenarios = wlb.ScenarioManager()
    # Block size varies slowest, time step mode fastest.
    for block_size in block_sizes:
        for direct_comm in direct_comm_options:
            for time_step_mode in time_step_modes:
                scenarios.add(BenchmarkScenario(block_size=block_size,
                                                direct_comm=direct_comm,
                                                time_step_mode=time_step_mode))
# Register all benchmark scenarios as soon as this script is executed.
benchmark()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment