Commit bfe0428a authored by Markus Holzer's avatar Markus Holzer
Browse files

Validated App

parent 4f0d1ab6
Pipeline #35901 canceled with stages
in 42 seconds
......@@ -16,6 +16,7 @@ foreach(streaming_pattern pull push aa esotwist)
UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h
UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h
UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h
UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h
UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h
UniformGridCPU_InfoHeader.h
)
......
DomainSetup
{
blocks < 1, 1, 1 >;
cellsPerBlock < 64, 64, 64 >;
periodic < 1, 1, 1 >;
}
Parameters
{
timesteps 200; // time steps of one performance measurement
warmupSteps 1; // number of steps to run before measurement starts
outerIterations 4; // how many measurements to conduct
vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled
timeStepMode twoField;
//twoFieldKernelType manualD3Q19;
remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time
directComm 0;
omega 1.8;
shearVelocityMagnitude 0.02;
useGui 0;
}
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file UniformGridCPU.cpp
//! \author Martin Bauer <martin.bauer@fau.de>
//! \author Frederik Hennig <frederik.hennig@fau.de>
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/communication/UniformBufferedScheme.h"
......@@ -28,8 +50,9 @@
using namespace walberla;
typedef lbm::UniformGridCPU_PackInfoEven PackInfoEven_T;
typedef lbm::UniformGridCPU_PackInfoOdd PackInfoOdd_T;
using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven;
using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd;
using LbSweep = lbm::UniformGridCPU_LbKernel;
using FlagField_T = FlagField< uint8_t >;
......@@ -62,6 +85,7 @@ int main(int argc, char** argv)
// Creating fields
BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs");
BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx);
BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_t(1.0), field::fzyx);
// Initialize velocity on cpu
if (initShearFlow)
......@@ -70,7 +94,8 @@ int main(int argc, char** argv)
initShearVelocity(blocks, velFieldId);
}
pystencils::UniformGridCPU_MacroSetter setterSweep(pdfFieldId, velFieldId);
pystencils::UniformGridCPU_MacroSetter setterSweep(densityFieldId, pdfFieldId, velFieldId);
pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId);
// Set up initial PDF values
for (auto& block : *blocks)
......@@ -88,8 +113,6 @@ int main(int argc, char** argv)
}
Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]);
using LbSweep = lbm::UniformGridCPU_LbKernel;
LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell);
pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId);
......@@ -100,6 +123,7 @@ int main(int argc, char** argv)
bool boundaries = false;
if (boundariesConfig)
{
WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions")
boundaries = true;
geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig);
geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID);
......@@ -231,6 +255,11 @@ int main(int argc, char** argv)
auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldId, "vel");
vtkOutput->addCellDataWriter(velWriter);
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks){
getterSweep(&block);}
});
timeLoop.addFuncAfterTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
......
......@@ -98,8 +98,8 @@ with CodeGeneration() as ctx:
q = stencil.Q
dim = stencil.D
assert dim == 3, "This app supports only three-dimensional stencils"
pdfs, pdfs_tmp, velocity_field = ps.fields(f"pdfs({q}), pdfs_tmp({q}), velocity(3) : {field_type}[3D]",
layout='fzyx')
pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options)
lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx')
......@@ -132,11 +132,19 @@ with CodeGeneration() as ctx:
inner_outer_split=True, field_swaps=field_swaps)
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, density=1.0, velocity=velocity_field.center_vector,
setter_assignments = macroscopic_values_setter(lb_method,
density=density_field.center, velocity=velocity_field.center_vector,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
getter_assignments = macroscopic_values_getter(lb_method,
density=density_field, velocity=velocity_field,
pdfs=pdfs,
streaming_pattern=streaming_pattern,
previous_timestep=Timestep.EVEN)
generate_sweep(ctx, 'UniformGridCPU_MacroSetter', setter_assignments, target=ps.Target.CPU)
generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU)
# Stream only kernel
generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only,
......@@ -167,7 +175,8 @@ with CodeGeneration() as ctx:
stencil_typedefs = {'Stencil_T': stencil,
'CommunicationStencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field}
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
# Info header containing correct template definitions for stencil and field
generate_info_header(ctx, 'UniformGridCPU_InfoHeader',
......
import os
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sys
import sqlite3
from math import prod
# Reference workload: number of time steps executed for 128^3 cells per process.
# The step count scales inversely with the number of cells (twice the cells -> half the steps).
# Raise this value to obtain more reliable measurements.
TIME_STEPS_FOR_128_BLOCK = 50

# SQLite file that receives the benchmark results.
DB_FILE = "cpu_benchmark.sqlite3"

# Baseline configuration values, grouped by parameter-file block.
BASE_CONFIG = dict(
    DomainSetup=dict(
        cellsPerBlock=(256, 128, 128),
        periodic=(1, 1, 1),
    ),
    Parameters=dict(
        omega=1.8,
        warmupSteps=5,
        outerIterations=3,
    ),
)
def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK):
    """Scale the number of time steps inversely with the number of cells in a block.

    :param block_size: sequence whose first three entries are the cells per block in x, y, z
    :param time_steps_for_128_block: number of time steps used for a 128^3 block
    :return: the scaled number of time steps, truncated to an integer
    """
    cells_in_block = block_size[0] * block_size[1] * block_size[2]
    reference_cells = 128 ** 3
    scaled_steps = (reference_cells / cells_in_block) * time_steps_for_128_block
    return int(scaled_steps)
def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8):
    """Check whether one block of the given size needs strictly less than `total_mem` bytes.

    :param block_size: iterable with the number of cells per direction
    :param total_mem: available memory in bytes
    :param gls: number of ghost layers added on both sides of every direction
    :param q: number of values stored per cell
    :param size_per_value: size of one stored value in bytes
    :return: True if the estimated memory demand is below `total_mem`
    """
    padded_extents = [extent + 2 * gls for extent in block_size]
    required_bytes = prod(padded_extents) * q * size_per_value
    return required_bytes < total_mem
# Boundary configuration for the boundary scenario (ldc) used by `Scenario` when
# `boundary_setup` is enabled; in that case the domain is made non-periodic in all
# directions, so every one of the six faces needs exactly one boundary flag.
# The north face carries the 'UBB' flag (presumably the moving lid - TODO confirm),
# all remaining faces are 'NoSlip' walls.
# Fix: the original list assigned 'N' twice (NoSlip and UBB) and left out 'W'.
ldc_setup = {'Border': [
    {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'},
    {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'},
    {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'},
    {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'},
    {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'},
    {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'},
]}
class Scenario:
    """Parameter set of one benchmark run.

    The scenario exposes its configuration and a result sink through the two
    methods decorated with `wlb.member_callback`.
    """

    def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1),
                 timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1),
                 warmup_steps=5, outer_iterations=3, init_shear_flow=False, boundary_setup=False,
                 vtk_write_frequency=0, remaining_time_logger_frequency=-1):
        """Store all scenario parameters and build the configuration dictionary once.

        With `boundary_setup` enabled, shear-flow initialisation is switched off and
        the domain is made non-periodic, regardless of the values passed in.
        If `timesteps` is falsy, the count is derived from `cells_per_block`.
        """
        self.blocks = block_decomposition(wlb.mpi.numProcesses())

        # Domain description
        self.cells_per_block = cells_per_block
        self.periodic = (0, 0, 0) if boundary_setup else periodic

        # Time stepping
        self.time_step_strategy = time_step_strategy
        self.omega = omega
        self.timesteps = timesteps or num_time_steps(cells_per_block)
        self.inner_outer_split = inner_outer_split

        # Initial and boundary conditions
        self.init_shear_flow = False if boundary_setup else init_shear_flow
        self.boundary_setup = boundary_setup

        # Measurement and output control
        self.warmup_steps = warmup_steps
        self.outer_iterations = outer_iterations
        self.vtk_write_frequency = vtk_write_frequency
        self.remaining_time_logger_frequency = remaining_time_logger_frequency

        # Cached without logging; reused later by results_callback.
        self.config_dict = self.config(print_dict=False)

    @wlb.member_callback
    def config(self, print_dict=True):
        """Assemble the configuration dictionary of this scenario.

        :param print_dict: if True, the pretty-printed dictionary is logged on the root process
        :return: dictionary with 'DomainSetup' and 'Parameters' (plus 'Boundaries' when
                 `boundary_setup` is enabled)
        """
        from pprint import pformat

        domain_setup = {
            'blocks': self.blocks,
            'cellsPerBlock': self.cells_per_block,
            'periodic': self.periodic,
        }
        parameters = {
            'omega': self.omega,
            'warmupSteps': self.warmup_steps,
            'outerIterations': self.outer_iterations,
            'timeStepStrategy': self.time_step_strategy,
            'timesteps': self.timesteps,
            'initShearFlow': self.init_shear_flow,
            'innerOuterSplit': self.inner_outer_split,
            'vtkWriteFrequency': self.vtk_write_frequency,
            'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency
        }
        config_dict = {'DomainSetup': domain_setup, 'Parameters': parameters}

        if self.boundary_setup:
            config_dict["Boundaries"] = ldc_setup

        if print_dict:
            wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
        return config_dict

    @wlb.member_callback
    def results_callback(self, **kwargs):
        """Combine configuration, results and build information and store them in `DB_FILE`.

        :param kwargs: result values; the table name is built from the entries 'stencil',
                       'streamingPattern' and 'collisionSetup', which are presumably
                       supplied here by the caller (verify against the application)
        """
        record = {
            **self.config_dict['Parameters'],
            **self.config_dict['DomainSetup'],
            **kwargs,
            'executable': sys.argv[0],
            'compile_flags': wlb.build_info.compiler_flags,
            'walberla_version': wlb.build_info.version,
            'build_machine': wlb.build_info.build_machine,
        }
        # The helper is applied twice to the same dictionary object.
        sequenceValuesToScalars(record)
        sequenceValuesToScalars(record)

        table_name = f"runs_{record['stencil']}_{record['streamingPattern']}_{record['collisionSetup']}_{prod(self.blocks)}"

        # Writing is retried a few times, e.g. it may fail when multiple benchmark
        # processes are running.
        max_attempts = 4
        for attempt in range(1, max_attempts + 1):
            try:
                checkAndUpdateSchema(record, table_name, DB_FILE)
                storeSingle(record, table_name, DB_FILE)
            except sqlite3.OperationalError as err:
                wlb.log_warning(f"Sqlite DB writing failed: try {attempt}/{max_attempts} {str(err)}")
            else:
                break
# -------------------------------------- Profiling -----------------------------------
def profiling():
    """Register one short 'kernelOnly' scenario (2 time steps, no warm-up) for profiling"""
    wlb.log_info_on_root("Running 2 timesteps for profiling")
    wlb.log_info_on_root("")

    manager = wlb.ScenarioManager()
    profiling_scenario = Scenario(
        cells_per_block=(128, 128, 128),
        time_step_strategy='kernelOnly',
        inner_outer_split=(1, 1, 1),
        timesteps=2,
        outer_iterations=1,
        warmup_steps=0,
    )
    manager.add(profiling_scenario)
# -------------------------------------- Functions trying different parameter sets -----------------------------------
def overlap_benchmark():
    """Register one 'noOverlap' scenario followed by 'simpleOverlap' scenarios with varying splits"""
    wlb.log_info_on_root("Running different communication overlap strategies")
    wlb.log_info_on_root("")

    manager = wlb.ScenarioManager()

    # Split widths applied to the x direction only, to x and y, and to all three directions.
    widths = (4, 8, 16, 32)
    splits = [(1, 1, 1)]
    splits += [(w, 1, 1) for w in widths]
    splits += [(w, w, 1) for w in widths]
    splits += [(w, w, w) for w in widths]

    # Reference run without overlap
    manager.add(Scenario(time_step_strategy='noOverlap',
                         inner_outer_split=(1, 1, 1)))

    for split in splits:
        manager.add(Scenario(time_step_strategy='simpleOverlap',
                             inner_outer_split=split))
def single_node_benchmark():
    """Register 'kernelOnly' scenarios for cubic blocks of increasing edge length.

    The available memory is read from the environment variable MAIN_MEMORY_GB
    (default 128); blocks rejected by `domain_block_size_ok` are skipped.
    """
    wlb.log_info_on_root("Running single Node benchmarks")
    wlb.log_info_on_root("")

    memory_gb = int(os.environ.get('MAIN_MEMORY_GB', 128))
    memory_bytes = memory_gb * (2 ** 30)

    manager = wlb.ScenarioManager()
    for edge in (8, 16, 32, 64, 128, 256):
        block_size = (edge, edge, edge)
        if domain_block_size_ok(block_size, memory_bytes):
            manager.add(Scenario(cells_per_block=block_size,
                                 time_step_strategy='kernelOnly',
                                 timesteps=num_time_steps(block_size)))
        else:
            wlb.log_info_on_root(f"Block size {block_size} would exceed main memory. Skipping.")
def validation_run():
    """Register a single scenario to check if the code works.

    As configured here it is the boundary scenario (ldc) on a 64^3 block with
    VTK output every 1000 steps; shear-flow initialisation is disabled.
    """
    wlb.log_info_on_root("Validation run")
    wlb.log_info_on_root("")

    strategy = 'simpleOverlap'  # alternative: 'noOverlap'
    settings = dict(
        cells_per_block=(64, 64, 64),
        time_step_strategy=strategy,
        timesteps=3001,
        outer_iterations=1,
        warmup_steps=0,
        init_shear_flow=False,
        boundary_setup=True,
        vtk_write_frequency=1000,
        remaining_time_logger_frequency=10,
    )

    manager = wlb.ScenarioManager()
    manager.add(Scenario(**settings))
wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}")
# Select the benchmark you want to run by uncommenting exactly one of the calls below
# single_node_benchmark() # benchmarks different block sizes and measures the single-node CPU
# performance of the compute kernel only ('kernelOnly' time step strategy)
# overlap_benchmark() # benchmarks different communication overlap options
# profiling() # run only two timesteps on a 128^3 block for profiling only
# Currently selected: the validation run defined in this file
validation_run()
......@@ -102,7 +102,7 @@ void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_param
{
{{kernel|generate_block_data_to_field_extraction|indent(4)}}
if( layers_.size() == 0 )
if( layers_.empty() )
{
CellInterval ci;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment