Commits (79)
This diff is collapsed.
......@@ -250,6 +250,14 @@ else()
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_MPI_WRAPPER )
# Check for intel llvm compiler
if( CMAKE_CXX_COMPILER MATCHES "icpx" OR CMAKE_CXX_COMPILER_ARG1 MATCHES "icpx" )
option ( WALBERLA_CXX_COMPILER_IS_INTELLLVM "Use Intel LLVM compiler" ON )
else()
option ( WALBERLA_CXX_COMPILER_IS_INTELLLVM "Use Intel LLVM compiler" OFF )
endif()
mark_as_advanced ( WALBERLA_CXX_COMPILER_IS_INTELLLVM )
############################################################################################################################
......@@ -427,6 +435,10 @@ if ( WALBERLA_BUILD_WITH_FASTMATH )
if( WALBERLA_CXX_COMPILER_IS_MSVC )
add_flag( CMAKE_CXX_FLAGS "/fp:fast" )
endif()
else()
if( WALBERLA_CXX_COMPILER_IS_INTELLLVM )
add_flag( CMAKE_CXX_FLAGS "-fp-model=precise")
endif()
endif()
# Xcode generator disables -isystem flag, even though current versions of Xcode support it
......@@ -478,7 +490,7 @@ try_compile( WALBERLA_USE_STD_FILESYSTEM "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_
COMPILE_DEFINITIONS -DWALBERLA_USE_STD_FILESYSTEM COPY_FILE "${CMAKE_CURRENT_BINARY_DIR}/TestStdFilesystem" )
if( WALBERLA_USE_STD_FILESYSTEM )
# detect https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90050 by checking whether it segfaults
execute_process( COMMAND "${CMAKE_CURRENT_BINARY_DIR}/TestStdFilesystem" RESULT_VARIABLE WALBERLA_STD_FILESYSTEM_WORKS )
execute_process( COMMAND "${CMAKE_CURRENT_BINARY_DIR}/TestStdFilesystem" OUTPUT_QUIET RESULT_VARIABLE WALBERLA_STD_FILESYSTEM_WORKS )
endif()
if( WALBERLA_USE_STD_FILESYSTEM AND WALBERLA_STD_FILESYSTEM_WORKS EQUAL 0 )
message( STATUS "Found std::filesystem")
......@@ -1011,11 +1023,18 @@ endif()
option ( WALBERLA_THREAD_SAFE_LOGGING "Enables/Disables thread-safe logging" ON )
if ( WALBERLA_BUILD_WITH_OPENMP )
if( APPLE AND EXISTS /opt/local/lib/libomp AND EXISTS /opt/local/include/libomp ) # find libomp from MacPorts
set( CMAKE_FRAMEWORK_PATH /opt/local/lib/libomp )
set( CMAKE_INCLUDE_PATH /opt/local/include/libomp )
endif()
find_package( OpenMP )
if (OpenMP_FOUND)
add_flag ( CMAKE_C_FLAGS "${OpenMP_C_FLAGS}" )
add_flag ( CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS}" )
list ( APPEND SERVICE_LIBS ${OpenMP_CXX_LIBRARIES} )
if( OpenMP_CXX_INCLUDE_DIRS )
include_directories( ${OpenMP_CXX_INCLUDE_DIRS} )
endif()
else()
#workarounds
if ( WALBERLA_CXX_COMPILER_IS_NEC )
......@@ -1026,6 +1045,17 @@ if ( WALBERLA_BUILD_WITH_OPENMP )
message(FATAL_ERROR "Could NOT enable OpenMP")
endif()
endif()
if( WALBERLA_CXX_COMPILER_IS_CLANG OR WALBERLA_CXX_COMPILER_IS_INTELLLVM )
# check for bug in combination with OpenMP and sign conversion https://bugs.llvm.org/show_bug.cgi?id=48387
try_compile( WALBERLA_CLANG_OPENMP_BUG "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/TestClangOpenMPBug.cpp"
COMPILE_DEFINITIONS -Werror )
if( NOT ${WALBERLA_CLANG_OPENMP_BUG} )
message( WARNING "Setting -Wno-sign-conversion due to a compiler bug in LLVM (https://bugs.llvm.org/show_bug.cgi?id=48387)" )
add_flag( CMAKE_CXX_FLAGS "-Wno-sign-conversion" )
endif()
endif()
else()
if ( WALBERLA_CXX_COMPILER_IS_CRAY )
add_flag ( CMAKE_C_FLAGS "-h noomp" )
......@@ -1256,6 +1286,11 @@ include_directories ( ${CMAKE_CURRENT_BINARY_DIR}/src )
# All include paths are specified relative to src/ directory
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/src )
# external
add_subdirectory( extern )
# sources
add_subdirectory ( src )
# Generate file with compile options, and add install rule for it
configure_file ( src/waLBerlaDefinitions.in.h
......@@ -1263,8 +1298,6 @@ configure_file ( src/waLBerlaDefinitions.in.h
install( FILES ${walberla_BINARY_DIR}/src/waLBerlaDefinitions.h DESTINATION walberla/ )
# sources
add_subdirectory ( src )
# test
if ( WALBERLA_BUILD_TESTS )
......
......@@ -32,4 +32,4 @@ endif()
# Python module
if ( WALBERLA_BUILD_WITH_PYTHON )
add_subdirectory( pythonmodule )
endif()
\ No newline at end of file
endif()
......@@ -1452,7 +1452,7 @@ int main( int argc, char **argv )
WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
// check refinement criterions and refine/coarsen if necessary
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
......@@ -2090,7 +2090,7 @@ int main( int argc, char **argv )
WALBERLA_LOG_INFO_ON_ROOT("Refreshing blockforest...")
// check refinement criterions and refine/coarsen if necessary
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
......
......@@ -929,7 +929,7 @@ int main( int argc, char **argv )
if( !useStaticRefinement && refinementCheckFrequency == 0 && numberOfLevels != 1 )
{
// determine check frequency automatically based on maximum admissable velocity and block sizes
// determine check frequency automatically based on maximum admissible velocity and block sizes
real_t uMax = real_t(0.1);
real_t refinementCheckFrequencyFinestLevel = ( overlap + real_c(blockSize) - real_t(2) * real_t(FieldGhostLayers) * dx) / uMax;
refinementCheckFrequency = uint_c( refinementCheckFrequencyFinestLevel / real_t(lbmTimeStepsPerTimeLoopIteration));
......@@ -1252,7 +1252,7 @@ int main( int argc, char **argv )
(*velocityCommunicationScheme)();
}
// check refinement criterions and refine/coarsen if necessary
// check refinement criteria and refine/coarsen if necessary
uint_t stampBefore = blocks->getBlockForest().getModificationStamp();
blocks->refresh();
uint_t stampAfter = blocks->getBlockForest().getModificationStamp();
......
......@@ -6,7 +6,7 @@ import os
class Parameter:
def __init__(self, name, type, defValue="", comment=""):
"""Propery of a data strcuture
"""Property of a data structure
Parameters
----------
......
......@@ -70,21 +70,29 @@ auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const stora
};
auto VelocityCallback = [](const Cell& pos, const shared_ptr< StructuredBlockForest >& SbF, IBlock& block,
real_t inflow_velocity) {
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = domain.yMax() - domain.yMin();
real_t h_z = domain.zMax() - domain.zMin();
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t inflow_velocity, const bool constant_inflow = true) {
if (constant_inflow)
{
Vector3< real_t > result(inflow_velocity, 0.0, 0.0);
return result;
}
else
{
Cell globalCell;
CellInterval domain = SbF->getDomainCellBB();
real_t h_y = real_c(domain.ySize());
real_t h_z = real_c(domain.zSize());
SbF->transformBlockLocalToGlobalCell(globalCell, block, pos);
real_t y1 = globalCell[1] - (h_y / 2.0 + 0.5);
real_t z1 = globalCell[2] - (h_z / 2.0 + 0.5);
real_t y1 = real_c(globalCell[1] - (h_y / 2.0 - 0.5));
real_t z1 = real_c(globalCell[2] - (h_z / 2.0 - 0.5));
real_t u = (inflow_velocity * 16) / (h_y * h_y * h_z * h_z) * (h_y / 2.0 - y1) * (h_y / 2 + y1) * (h_z / 2 - z1) *
(h_z / 2 + z1);
real_t u = (inflow_velocity * real_c(16)) / (h_y * h_y * h_z * h_z) * (h_y / real_c(2.0) - y1) *
(h_y / real_c(2.0) + y1) * (h_z / real_c(2.0) - z1) * (h_z / real_c(2.0) + z1);
Vector3< real_t > result(u, 0.0, 0.0);
return result;
Vector3< real_t > result(u, 0.0, 0.0);
return result;
}
};
class AlternatingBeforeFunction
......@@ -147,6 +155,7 @@ int main(int argc, char** argv)
const real_t u_max = parameters.getParameter< real_t >("u_max", real_t(0.05));
const real_t reynolds_number = parameters.getParameter< real_t >("reynolds_number", real_t(1000));
const uint_t diameter_sphere = parameters.getParameter< uint_t >("diameter_sphere", uint_t(5));
const bool constant_inflow = parameters.getParameter< bool >("constant_inflow", true);
const double remainingTimeLoggerFrequency =
parameters.getParameter< double >("remainingTimeLoggerFrequency", 3.0); // in seconds
......@@ -204,7 +213,7 @@ int main(int argc, char** argv)
auto boundariesConfig = config->getOneBlock("Boundaries");
std::function< Vector3< real_t >(const Cell&, const shared_ptr< StructuredBlockForest >&, IBlock&) >
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max);
velocity_initialisation = std::bind(VelocityCallback, _1, _2, _3, u_max, constant_inflow);
#if defined(WALBERLA_BUILD_WITH_CUDA)
lbm::FlowAroundSphereCodeGen_UBB ubb(blocks, pdfFieldIDGPU, velocity_initialisation);
......@@ -236,10 +245,9 @@ int main(int argc, char** argv)
AlternatingBeforeFunction communication(evenComm, oddComm, tracker);
// add LBM sweep and communication to time loop
timeloop.add() << BeforeFunction(communication, "communication")
<< Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(communication, "communication") << Sweep(ubb.getSweep(tracker), "ubb boundary");
timeloop.add() << Sweep(outflow.getSweep(tracker), "outflow boundary");
timeloop.add() << Sweep(ubb.getSweep(tracker), "ubb boundary");
timeloop.add() << Sweep(noSlip.getSweep(tracker), "noSlip boundary");
timeloop.add() << BeforeFunction(tracker->getAdvancementFunction(), "Timestep Advancement")
<< Sweep(lbSweep.getSweep(tracker), "LB update rule");
......
from pystencils import Target
from pystencils.field import fields
from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil
from lbmpy.advanced_streaming.utility import get_timesteps, Timestep
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.macroscopic_value_kernels import macroscopic_values_setter
from lbmpy.stencils import get_stencil
from lbmpy.creationfunctions import create_lb_collision_rule, create_lb_method, create_lb_update_rule
from lbmpy.creationfunctions import create_lb_collision_rule
from lbmpy.boundaries import NoSlip, UBB, ExtrapolationOutflow
from pystencils_walberla import CodeGeneration, generate_sweep, generate_info_header
from lbmpy_walberla.additional_data_handler import UBBAdditionalDataHandler, OutflowAdditionalDataHandler
from lbmpy_walberla import generate_boundary, generate_lb_pack_info
from lbmpy_walberla import generate_lb_pack_info
from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary
import sympy as sp
stencil = get_stencil("D3Q27")
q = len(stencil)
dim = len(stencil[0])
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : double[{dim}D]", layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
opt = {'symbolic_field': pdfs,
'cse_global': False,
'cse_pdfs': False}
method_params = {'method': 'cumulant',
'stencil': stencil,
'relaxation_rate': omega,
'galilean_correction': True,
'field_name': 'pdfs',
'streaming_pattern': streaming_pattern,
'output': output,
'optimization': opt}
collision_rule = create_lb_collision_rule(**method_params)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
with CodeGeneration() as ctx:
data_type = "float64" if ctx.double_accuracy else "float32"
stencil = LBStencil(Stencil.D3Q27)
q = stencil.Q
dim = stencil.D
streaming_pattern = 'esotwist'
timesteps = get_timesteps(streaming_pattern)
pdfs, velocity_field, density_field = fields(f"pdfs({q}), velocity({dim}), density(1) : {data_type}[{dim}D]",
layout='fzyx')
omega = sp.Symbol("omega")
u_max = sp.Symbol("u_max")
output = {
'density': density_field,
'velocity': velocity_field
}
lbm_config = LBMConfig(stencil=stencil, method=Method.CUMULANT, relaxation_rate=omega, galilean_correction=True,
field_name='pdfs', streaming_pattern=streaming_pattern, output=output)
lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, cse_global=False, cse_pdfs=False)
collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation)
lb_method = collision_rule.method
# getter & setter
setter_assignments = macroscopic_values_setter(lb_method, velocity=velocity_field.center_vector,
pdfs=pdfs, density=1.0,
streaming_pattern=streaming_pattern,
previous_timestep=timesteps[0])
# opt = {'instruction_set': 'sse', 'assume_aligned': True, 'nontemporal': False, 'assume_inner_stride_one': True}
stencil_typedefs = {'Stencil_T': stencil}
field_typedefs = {'PdfField_T': pdfs,
'VelocityField_T': velocity_field,
'ScalarField_T': density_field}
if ctx.cuda:
target = 'gpu'
target = Target.GPU
else:
target = 'cpu'
opt['target'] = target
target = Target.CPU
# sweeps
generate_alternating_lbm_sweep(ctx, 'FlowAroundSphereCodeGen_LbSweep',
collision_rule, streaming_pattern, optimization=opt)
collision_rule, lbm_config=lbm_config, lbm_optimisation=lbm_optimisation,
target=target)
generate_sweep(ctx, 'FlowAroundSphereCodeGen_MacroSetter', setter_assignments, target=target)
# boundaries
......
......@@ -4,13 +4,15 @@ from lbmpy.relaxationrates import relaxation_rate_from_lattice_viscosity
class Scenario:
def __init__(self):
self.timesteps = 101
self.vtkWriteFrequency = 2000
self.timesteps = 1001
self.vtkWriteFrequency = 100
self.cells = (384, 128, 128)
self.blocks = (1, 1, 1)
self.periodic = (0, 0, 0)
self.constant_inflow = True
self.diameter_sphere = min(self.cells) // 2
self.u_max = 0.1
self.reynolds_number = 1000000
......@@ -38,7 +40,8 @@ class Scenario:
'omega': self.omega,
'u_max': self.u_max,
'reynolds_number': self.reynolds_number,
'diameter_sphere': self.diameter_sphere
'diameter_sphere': self.diameter_sphere,
'constant_inflow': self.constant_inflow
},
'Boundaries': {
'Border': [
......@@ -54,7 +57,7 @@ class Scenario:
'midpoint': (int(0.40 * self.total_cells[0]), self.total_cells[1] // 2, self.total_cells[2] // 2),
'radius': self.diameter_sphere // 2,
'flag': 'NoSlip'}
]
],
},
}
......
......@@ -878,7 +878,7 @@ int main( int argc, char **argv )
real_t defaultOmegaBulk = lbm_mesapd_coupling::omegaBulkFromOmega(omega, real_t(1));
shared_ptr<OmegaBulkAdapter_T> omegaBulkAdapter = make_shared<OmegaBulkAdapter_T>(blocks, omegaBulkFieldID, accessor, defaultOmegaBulk, omegaBulk, adaptionLayerSize, sphereSelector);
timeloopAfterParticles.add() << Sweep( makeSharedSweep(omegaBulkAdapter), "Omega Bulk Adapter");
// initally adapt
// initially adapt
for (auto blockIt = blocks->begin(); blockIt != blocks->end(); ++blockIt) {
(*omegaBulkAdapter)(blockIt.get());
}
......
......@@ -843,7 +843,7 @@ int main( int argc, char **argv )
auto sphereShape = ss->create<mesa_pd::data::Sphere>( diameter * real_t(0.5) );
ss->shapes[sphereShape]->updateMassAndInertia(densityRatio);
std::mt19937 randomGenerator (static_cast<unsigned int>(2610)); // fixed seed: quasi-random and reproducable
std::mt19937 randomGenerator (static_cast<unsigned int>(2610)); // fixed seed: quasi-random and reproducible
for( uint_t nSed = 0; nSed < numberOfSediments; ++nSed )
{
......@@ -962,7 +962,7 @@ int main( int argc, char **argv )
if(currentPhase == 1)
{
// damp velocites to avoid too large ones
// damp velocities to avoid too large ones
ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
[](const size_t idx, ParticleAccessor_T& ac){
ac.setLinearVelocity(idx, ac.getLinearVelocity(idx) * real_t(0.5));
......
......@@ -573,7 +573,7 @@ int main( int argc, char **argv )
if(maxPenetrationDepth < overlapLimit) break;
// reset velocites to avoid too large ones
// reset velocities to avoid too large ones
ps->forEachParticle( useOpenMP, mesa_pd::kernel::SelectLocal(), *accessor,
[](const size_t idx, ParticleAccessor_T& ac){
......
......@@ -6,7 +6,7 @@ import os
class Parameter:
def __init__(self, name, type, defValue=""):
"""Propery of a data strcuture
"""Property of a data structure
Parameters
----------
......
......@@ -3,30 +3,43 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from waLBerla.tools.config import block_decomposition
import sys
from math import prod
def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
"""Checks if a single block of given size fits into GPU memory"""
cells = prod(b + 2 * gls for b in block_size)
# 3 values for the velocity and two for the phase field and the temporary phase field
values_per_cell = 2 * q_phase + 2 * q_hydro + 3 + 2
needed_memory = values_per_cell * cells * size_per_value
return needed_memory < total_mem
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 101
edge_size = 32
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.cells_per_block = cells_per_block
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.size = (self.cells_per_block[0] * self.blocks[0],
self.cells_per_block[1] * self.blocks[1],
self.cells_per_block[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.timeStepStrategy = time_step_strategy
self.cuda_block_size = cuda_block_size
self.warmupSteps = 10
self.cudaEnabledMpi = cuda_enabled_mpi
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
......@@ -41,19 +54,18 @@ class Scenario:
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
'cudaEnabledMpi': self.cudaEnabledMpi
},
'Boundaries_GPU': {
'Border': []
......@@ -86,43 +98,46 @@ class Scenario:
df.to_csv(self.csv_file, index=False, mode='a', header=False)
def overlap_benchmark():
def benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1), (512, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1), (256, 2, 1),
(32, 4, 1), (64, 4, 1), (128, 4, 1),
(32, 4, 2), (64, 4, 2), (128, 2, 2),
(32, 8, 1), (64, 8, 1), (64, 4, 2),
(32, 16, 1), (16, 16, 1), (16, 16, 2)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
for cuda_block in cuda_blocks:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth,
cuda_block_size=cuda_block)
scenarios.add(scenario)
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_size = (256, 256, 256)
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
else:
scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(1, 1, 1),
cuda_block_size=(128, 1, 1))
scenarios.add(scenario)
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2),
(32, 8, 1),
(16, 16, 1)]
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for cuda_block in cuda_blocks:
for block_size in block_sizes:
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
continue
scenario = Scenario(time_step_strategy=time_step_strategy,
cuda_block_size=cuda_block,
cells_per_block=block_size)
scenarios.add(scenario)
# overlap_benchmark()
# benchmark()
kernel_benchmark()
......@@ -3,57 +3,55 @@ import waLBerla as wlb
import pandas as pd
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
from waLBerla.tools.config import block_decomposition
import sys
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth):
def __init__(self, time_step_strategy, cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = -1
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 500
edge_size = 50
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.timesteps = 101
self.cells_per_block = cells_per_block
self.blocks = block_decomposition(wlb.mpi.numProcesses())
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.size = (self.cells_per_block[0] * self.blocks[0],
self.cells_per_block[1] * self.blocks[1],
self.cells_per_block[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.cuda_block_size = (-1, -1, -1)
self.timeStepStrategy = time_step_strategy
self.warmupSteps = 10
self.cudaEnabledMpi = cuda_enabled_mpi
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
self.csv_file = "benchmark_cpu.csv"
self.csv_file = "benchmark.csv"
@wlb.member_callback
def config(self):
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': 10.0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
'scenario': 1,
'cudaEnabledMpi': self.cudaEnabledMpi
},
'Boundaries_GPU': {
'Border': []
......@@ -86,31 +84,23 @@ class Scenario:
df.to_csv(self.csv_file, index=False, mode='a', header=False)
def overlap_benchmark():
def benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32),
(64, 32, 32), (64, 64, 32), (64, 64, 64)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth)
scenarios.add(scenario)
block_size = (64, 64, 64)
scenarios.add(Scenario(time_step_strategy='normal', cells_per_block=block_size))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)]
# overlap
scenario = Scenario(timeStepStrategy='overlap',
overlappingWidth=(8, 8, 8))
scenarios.add(scenario)
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for block_size in block_sizes:
scenario = Scenario(time_step_strategy=time_step_strategy,
cells_per_block=block_size)
scenarios.add(scenario)
# overlap_benchmark()
# benchmark()
kernel_benchmark()
......@@ -37,8 +37,6 @@
#include "timeloop/SweepTimeloop.h"
#include <blockforest/communication/UniformBufferedScheme.h>
#include "InitializerFunctions.h"
//////////////////////////////
......@@ -48,45 +46,23 @@
#if defined(WALBERLA_BUILD_WITH_CUDA)
# include "cuda/AddGPUFieldToStorage.h"
# include "cuda/DeviceSelectMPI.h"
# include "cuda/HostFieldAllocator.h"
# include "cuda/ParallelStreams.h"
# include "cuda/communication/GPUPackInfo.h"
# include "cuda/communication/MemcpyPackInfo.h"
# include "cuda/communication/UniformGPUScheme.h"
# include "GenDefines.h"
# include "PackInfo_phase_field.h"
# include "PackInfo_phase_field_distributions.h"
# include "PackInfo_velocity_based_distributions.h"
# include "hydro_LB_step.h"
# include "initialize_phase_field_distributions.h"
# include "initialize_velocity_based_distributions.h"
# include "phase_field_LB_step.h"
#else
# include "GenDefines.h"
# include "PackInfo_phase_field.h"
# include "PackInfo_phase_field_distributions.h"
# include "PackInfo_velocity_based_distributions.h"
# include "hydro_LB_step.h"
# include "initialize_phase_field_distributions.h"
# include "initialize_velocity_based_distributions.h"
# include "phase_field_LB_step.h"
# include <blockforest/communication/UniformBufferedScheme.h>
#endif
#include "GenDefines.h"
using namespace walberla;
using PdfField_phase_T = GhostLayerField< real_t, Stencil_phase_T::Size >;
using PdfField_hydro_T = GhostLayerField< real_t, Stencil_hydro_T::Size >;
using VelocityField_T = GhostLayerField< real_t, Stencil_hydro_T::Dimension >;
using PhaseField_T = GhostLayerField< real_t, 1 >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
using flag_t = walberla::uint8_t;
using FlagField_T = FlagField< flag_t >;
#if defined(WALBERLA_BUILD_WITH_CUDA)
typedef cuda::GPUField< real_t > GPUField;
#endif
// using CommScheme_T = cuda::communication::UniformGPUScheme<stencil::D2Q9>;
int main(int argc, char** argv)
{
......@@ -112,9 +88,7 @@ int main(int argc, char** argv)
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", 3.0);
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
Vector3< int > overlappingWidth =
parameters.getParameter< Vector3< int > >("overlappingWidth", Vector3< int >(1, 1, 1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
#if defined(WALBERLA_BUILD_WITH_CUDA)
// CPU fields
......@@ -166,50 +140,46 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1],
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
gpuBlockSize[1],
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field, phase_field, vel_field, Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field,
Cell(overlappingWidth[0], overlappingWidth[1], overlappingWidth[2]));
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif
// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
auto Comm_velocity_based_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
auto Comm_phase_field_distributions =
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, 0);
make_shared< cuda::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< pystencils::PackInfo_velocity_based_distributions >(lb_velocity_field);
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< pystencils::PackInfo_phase_field_distributions >(lb_phase_field);
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
#endif
......@@ -245,29 +215,15 @@ int main(int argc, char** argv)
auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions->startCommunication(defaultStream);
for (auto& block : *blocks)
{
Comm_phase_field_distributions->communicate(nullptr);
phase_field_LB_step(&block);
phase_field_LB_step(&block, defaultStream);
Comm_velocity_based_distributions->wait(defaultStream);
Comm_velocity_based_distributions->communicate(nullptr);
hydro_LB_step(&block);
}
};
auto simpleOverlapTimeStep = [&]() {
Comm_phase_field_distributions->startCommunication(defaultStream);
for (auto& block : *blocks)
phase_field_LB_step.inner(&block, defaultStream);
hydro_LB_step(&block, defaultStream);
Comm_phase_field_distributions->wait(defaultStream);
for (auto& block : *blocks)
phase_field_LB_step.outer(&block, defaultStream);
Comm_velocity_based_distributions->startCommunication(defaultStream);
for (auto& block : *blocks)
hydro_LB_step.inner(&block, defaultStream);
Comm_velocity_based_distributions->wait(defaultStream);
for (auto& block : *blocks)
hydro_LB_step.outer(&block, defaultStream);
};
auto phase_only = [&]() {
for (auto& block : *blocks)
......@@ -285,29 +241,15 @@ int main(int argc, char** argv)
};
#else
auto normalTimeStep = [&]() {
for (auto& block : *blocks)
{
Comm_phase_field_distributions();
phase_field_LB_step(&block);
Comm_velocity_based_distributions();
hydro_LB_step(&block);
}
};
auto simpleOverlapTimeStep = [&]() {
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step.inner(&block);
Comm_phase_field_distributions.wait();
for (auto& block : *blocks)
phase_field_LB_step.outer(&block);
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step.inner(&block);
Comm_velocity_based_distributions.wait();
for (auto& block : *blocks)
hydro_LB_step.outer(&block);
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block);
Comm_velocity_based_distributions.wait();
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block);
Comm_phase_field_distributions.wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
......@@ -325,12 +267,7 @@ int main(int argc, char** argv)
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "overlap")
{
timeStep = std::function< void() >(simpleOverlapTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("overlapping timestep")
}
else if (timeStepStrategy == "phase_only")
if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
......@@ -348,7 +285,7 @@ int main(int argc, char** argv)
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with no overlapping")
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
}
timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
......@@ -365,10 +302,8 @@ int main(int argc, char** argv)
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
"simulation_step", false, true, true, false, 0);
#if defined(WALBERLA_BUILD_WITH_CUDA)
vtkOutput->addBeforeFunction([&]() {
cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu);
// cuda::fieldCpy<VelocityField_T, GPUField>( blocks, vel_field, vel_field_gpu );
});
vtkOutput->addBeforeFunction(
[&]() { cuda::fieldCpy< PhaseField_T, GPUField >(blocks, phase_field, phase_field_gpu); });
#endif
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter);
......@@ -403,6 +338,11 @@ int main(int argc, char** argv)
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
#endif
// Call Python function to report results
pythonCallbackResults();
}
......
import os
import waLBerla as wlb
import csv
from waLBerla.tools.sqlitedb import sequenceValuesToScalars
import sys
class Scenario:
def __init__(self, timeStepStrategy, overlappingWidth, cuda_block_size):
# output frequencies
self.vtkWriteFrequency = 0
# simulation parameters
self.timesteps = 201
edge_size = 100
self.cells = (edge_size, edge_size, edge_size)
self.blocks = (1, 1, 1)
self.periodic = (1, 1, 1)
self.size = (self.cells[0] * self.blocks[0],
self.cells[1] * self.blocks[1],
self.cells[2] * self.blocks[2])
self.timeStepStrategy = timeStepStrategy
self.overlappingWidth = overlappingWidth
self.cuda_block_size = cuda_block_size
self.warmupSteps = 20
# bubble parameters
self.bubbleRadius = min(self.size) // 4
self.bubbleMidPoint = (self.size[0] / 2, self.size[1] / 2, self.size[2] / 2)
self.scenario = 1 # 1 rising bubble, 2 RTI
self.config_dict = self.config()
self.csv_file = "benchmark_piz_daint.csv"
@wlb.member_callback
def config(self):
return {
'DomainSetup': {
'blocks': self.blocks,
'cellsPerBlock': self.cells,
'periodic': self.periodic,
},
'Parameters': {
'timesteps': self.timesteps,
'vtkWriteFrequency': self.vtkWriteFrequency,
'useGui': 0,
'remainingTimeLoggerFrequency': -1,
'timeStepStrategy': self.timeStepStrategy,
'overlappingWidth': self.overlappingWidth,
'gpuBlockSize': self.cuda_block_size,
'warmupSteps': self.warmupSteps,
'scenario': self.scenario,
},
'Boundaries_GPU': {
'Border': []
},
'Boundaries_CPU': {
'Border': []
},
'Bubble': {
'bubbleMidPoint': self.bubbleMidPoint,
'bubbleRadius': self.bubbleRadius,
},
}
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
sequenceValuesToScalars(data)
if not os.path.isfile(self.csv_file):
try:
with open(self.csv_file, 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=data)
writer.writeheader()
writer.writerow(data)
except IOError:
print("could not create csv file")
else:
try:
with open(self.csv_file, 'a') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=data)
writer.writerow(data)
except IOError:
print("could not write to csv file")
def overlap_benchmark():
scenarios = wlb.ScenarioManager()
overlappingWidths = [(1, 1, 1), (4, 1, 1), (8, 1, 1), (16, 1, 1), (32, 1, 1),
(4, 4, 1), (8, 8, 1), (16, 16, 1), (32, 32, 1),
(4, 4, 4), (8, 8, 8), (16, 16, 16), (32, 32, 32)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2), (32, 8, 1), (16, 16, 1)]
# no overlap
scenarios.add(Scenario(timeStepStrategy='normal', overlappingWidth=(1, 1, 1), cuda_block_size=(16, 16, 1)))
# overlap
for overlap_strategy in ['overlap']:
for overlappingWidth in overlappingWidths:
for cuda_block in cuda_blocks:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=overlappingWidth,
cuda_block_size=cuda_block)
scenarios.add(scenario)
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(16, 16, 16),
cuda_block_size=(128, 4, 1))
scenarios.add(scenario)
def weak_scaling():
scenarios = wlb.ScenarioManager()
# overlap
# for overlap_strategy in ['phase_only', 'hydro_only', 'kernel_only']:
for overlap_strategy in ['overlap']:
scenario = Scenario(timeStepStrategy=overlap_strategy,
overlappingWidth=(32, 32, 32),
cuda_block_size=(128, 2, 1))
scenarios.add(scenario)
# overlap_benchmark()
# kernel_benchmark()
weak_scaling()
from pystencils import fields, TypedSymbol
from pystencils import fields, Target, TypedSymbol
from pystencils.simp import sympy_cse
from pystencils import AssignmentCollection
from lbmpy.creationfunctions import create_lb_method, create_lb_update_rule