Commit fb98460a authored by Philipp Suffa

Merge branch 'UpdateBenchmarks' into 'master'

Update benchmarks

See merge request walberla/walberla!685
parents 0c0092f3 a74df61b
Showing 911 additions and 482 deletions
......@@ -10,6 +10,7 @@ qrc_*
# CLion indexing
*.uuid
.fleet
# Generated files
......@@ -32,11 +33,13 @@ qrc_*
# Visual Studio Code
/.vscode
# Zed
/.cache*
# CLion
*.idea
*.clion*
# QtCreator
CMakeLists.txt.user.*
......
......@@ -11,11 +11,6 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated
NonUniformGridCPUBoundaryCollection.h
NonUniformGridCPUInfoHeader.h)
waLBerla_add_executable( NAME NonUniformGridGenerator
FILES NonUniformGridGenerator.cpp LdcSetup.h
DEPENDS blockforest core field python_coupling )
waLBerla_add_executable( NAME NonUniformGridCPU
FILES NonUniformGridCPU.cpp LdcSetup.h
FILES NonUniformGridCPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated )
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include <string>
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
using StorageSpecification_T = lbm::NonUniformGridCPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using namespace walberla;
void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
const LDC ldc(refinementDepth);
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
if(mpi::MPIManager::instance()->numProcesses() > 1)
return;
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}
void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1)
{
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good())
{
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else
{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else
{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
\ No newline at end of file
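For orientation, the two parameter blocks that createSetupBlockForest and createBlockForest read correspond one-to-one to the dictionary returned by the benchmark's Scenario.config() callback further down. A minimal sketch of that dictionary, with illustrative values only (a sketch, not part of this merge request):

config_dict = {
    'DomainSetup': {
        'domainSize': (384, 384, 384),       # parsed as Vector3<real_t>
        'rootBlocks': (3, 3, 3),             # Vector3<uint_t>, domainSize / cellsPerBlock
        'cellsPerBlock': (128, 128, 128),    # Vector3<uint_t>
        'periodic': (0, 0, 0),               # Vector3<bool>
    },
    'SetupBlockForest': {
        'refinementDepth': 3,                  # defaults to 1 if omitted
        'numProcesses': 16,                    # overridden by the MPI size when useMPIManager=true
        'blockForestFilestem': 'blockforest',  # forest is saved to / loaded from <filestem>.bfs
        'writeVtk': False,
        'outputStatistics': True,
        'writeSetupForestAndReturn': True,     # read by the application's main(), not by GridGeneration.h
    },
}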
......@@ -48,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();
const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
for(auto & block : forest)
{
......
......@@ -38,6 +38,7 @@
#include <cmath>
#include "GridGeneration.h"
#include "LdcSetup.h"
#include "NonUniformGridCPUInfoHeader.h"
#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h"
......@@ -77,23 +78,25 @@ int main(int argc, char** argv)
auto config = *cfg;
logging::configureLogging(config);
auto domainSetup = config->getOneBlock("DomainSetup");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const bool writeSetupForestAndReturn = blockForestSetup.getParameter< bool >("writeSetupForestAndReturn", true);
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
auto domainSetup = config->getOneBlock("DomainSetup");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
shared_ptr< BlockForest > bfs;
createBlockForest(bfs, domainSetup, blockForestSetup);
if (writeSetupForestAndReturn && mpi::MPIManager::instance()->numProcesses() == 1)
{
WALBERLA_LOG_INFO_ON_ROOT("BlockForest has been created and written to file. Returning program")
return EXIT_SUCCESS;
}
WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...")
auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
auto blocks =
std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]);
blocks->createCellBoundingBoxes();
......@@ -173,6 +176,8 @@ int main(int argc, char** argv)
const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
const bool useVTKAMRWriter = parameters.getParameter< bool >("useVTKAMRWriter", false);
const bool oneFilePerProcess = parameters.getParameter< bool >("oneFilePerProcess", false);
auto finalDomain = blocks->getDomain();
if (vtkWriteFrequency > 0)
{
auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out",
......@@ -180,6 +185,12 @@ int main(int argc, char** argv)
auto velWriter = make_shared< field::VTKWriter< VelocityField_T, float32 > >(velFieldID, "vel");
vtkOutput->addCellDataWriter(velWriter);
if (parameters.getParameter< bool >("writeOnlySlice", true)){
const AABB sliceXY(finalDomain.xMin(), finalDomain.yMin(), finalDomain.center()[2] - blocks->dz(refinementDepth),
finalDomain.xMax(), finalDomain.yMax(), finalDomain.center()[2] + blocks->dz(refinementDepth));
vtkOutput->addCellInclusionFilter(vtk::AABBCellFilter(sliceXY));
}
vtkOutput->addBeforeFunction([&]() {
for (auto& block : *blocks)
sweepCollection.calculateMacroscopicParameters(&block);
......@@ -236,6 +247,8 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess",
......
......@@ -23,17 +23,23 @@ const bool infoCsePdfs = {cse_pdfs};
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
cpu_vec = {"instruction_set": None}
streaming_pattern = 'aa'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT
fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx")
......@@ -50,12 +56,12 @@ with CodeGeneration() as ctx:
lbm_config=lbm_config, lbm_optimisation=lbm_opt,
nonuniform=True, boundaries=[no_slip, ubb],
macroscopic_fields=macroscopic_fields,
target=ps.Target.CPU)
target=ps.Target.CPU, cpu_vectorize_info=cpu_vec,)
infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
......
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file NonUniformGridGenerator.cpp
//! \author Frederik Hennig <frederik.hennig@fau.de>
//
//======================================================================================================================
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/all.h"
#include "python_coupling/CreateConfig.h"
#include <string>
#include "LdcSetup.h"
using namespace walberla;
int main(int argc, char ** argv){
const mpi::Environment env(argc, argv);
mpi::MPIManager::instance()->useWorldComm();
if(mpi::MPIManager::instance()->numProcesses() > 1){
WALBERLA_ABORT("Commandment: Thou shalt not run thy grid generator with more than one process.");
}
for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg)
{
auto config = *cfg;
auto domainSetup = config->getOneBlock("DomainSetup");
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
auto blockForestSetup = config->getOneBlock("SetupBlockForest");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
const LDC ldc(refinementDepth);
SetupBlockForest setupBfs;
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++)
{
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const uint_t avgBlocksPerProc = setupBfs.getNumberOfBlocks() / setupBfs.getNumberOfProcesses();
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
WALBERLA_LOG_INFO_ON_ROOT("Ending program")
}
}
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sqlite3
import os
import sys
try:
import machinestate as ms
except ImportError:
ms = None
DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
class Scenario:
......@@ -18,7 +33,8 @@ class Scenario:
vtk_write_frequency=0,
logger_frequency=0,
blockforest_filestem="blockforest",
write_setup_vtk=False):
write_setup_vtk=True,
db_file_name=None):
self.domain_size = domain_size
self.root_blocks = root_blocks
......@@ -34,6 +50,8 @@ class Scenario:
self.vtk_write_frequency = vtk_write_frequency
self.logger_frequency = logger_frequency
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
@wlb.member_callback
......@@ -51,7 +69,8 @@ class Scenario:
'numProcesses': self.num_processes,
'blockForestFilestem': self.bfs_filestem,
'writeVtk': self.write_setup_vtk,
'outputStatistics': False
'outputStatistics': True,
'writeSetupForestAndReturn': True,
},
'Parameters': {
'omega': 1.95,
......@@ -59,14 +78,15 @@ class Scenario:
'remainingTimeLoggerFrequency': self.logger_frequency,
'vtkWriteFrequency': self.vtk_write_frequency,
'useVTKAMRWriter': True,
'oneFilePerProcess': False
'oneFilePerProcess': False,
'writeOnlySlice': False
},
'Logging': {
'logLevel': "info",
}
}
if (print_dict):
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
......@@ -82,6 +102,15 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
......@@ -92,52 +121,109 @@ class Scenario:
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, DB_FILE)
storeSingle(result, table_name, DB_FILE)
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
def weak_scaling_ldc(num_proc, uniform=False):
wlb.log_info_on_root("Running weak scaling benchmark...")
domain_size = (96, 96, 96)
cells_per_block = (32, 32, 32)
# This benchmark must run from 16 processes onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if uniform:
factor = 3 * num_proc
name = "uniform"
else:
if num_proc % 16 != 0:
raise RuntimeError("Number of processes must be divisible by 16")
factor = int(num_proc // 16)
name = "nonuniform"
cells_per_block = (WeakX, WeakY, WeakZ)
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(domain_size=domain_size,
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=1,
refinement_depth=1,
num_processes=num_proc,
cells_per_block=cells_per_block,
timesteps=201,
vtk_write_frequency=100,
logger_frequency=5,
write_setup_vtk=True)
refinement_depth=0 if uniform else 3,
timesteps=10,
db_file_name=f"weakScalingCPU{name}LDC.sqlite3")
scenarios.add(scenario)
def strong_scaling_ldc(num_proc, uniform=False):
wlb.log_info_on_root("Running strong scaling benchmark...")
# This benchmark must run from 64 processes onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if num_proc % 64 != 0:
raise RuntimeError("Number of processes must be divisible by 64")
cells_per_block = (StrongX, StrongY, StrongZ)
if uniform:
domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
name = "uniform"
else:
factor = int(num_proc / 64)
blocks64 = block_decomposition(factor)
cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
name = "nonuniform"
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
db_file_name=f"strongScalingCPU{name}LDC.sqlite3")
scenarios.add(scenario)
def scaling():
wlb.log_info_on_root("Running scaling benchmark...")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
numProc = wlb.mpi.numProcesses()
domain_size = (96, 96, 32)
cells_per_block = (32, 32, 32)
domain_size = (256, 256, 128 * numProc)
cells_per_block = (64, 64, 64)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(domain_size=domain_size,
root_blocks=root_blocks,
num_processes=1,
refinement_depth=3,
cells_per_block=cells_per_block,
refinement_depth=2,
timesteps=10)
timesteps=1001,
vtk_write_frequency=100,
logger_frequency=5,
write_setup_vtk=True)
scenarios.add(scenario)
validation_run()
# scaling()
if BENCHMARK == 0:
validation_run()
elif BENCHMARK == 1:
weak_scaling_ldc(1, False)
elif BENCHMARK == 2:
strong_scaling_ldc(1, False)
else:
print(f"Invalid benchmark case {BENCHMARK}")
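To make the sizing logic concrete, here is the nonuniform weak-scaling arithmetic for one hypothetical process count (num_proc = 32 with the default WeakX/WeakY/WeakZ of 128; the numbers are an illustration, not results from the benchmark runs):

num_proc = 32                      # hypothetical; must be divisible by 16 in the nonuniform case
cells_per_block = (128, 128, 128)  # WeakX, WeakY, WeakZ defaults
factor = num_proc // 16            # -> 2
domain_size = (128 * 3, 128 * 3, 128 * factor)                             # -> (384, 384, 256)
root_blocks = tuple(d // c for d, c in zip(domain_size, cells_per_block))  # -> (3, 3, 2)
# 3 * 3 * 2 = 18 blocks on the coarsest level; refinement_depth = 3 then adds finer
# blocks in the two lid corners selected by LDCRefinement.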
......@@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated
NonUniformGridGPUBoundaryCollection.h
NonUniformGridGPUInfoHeader.h)
waLBerla_add_executable( NAME NonUniformGridGPU
FILES NonUniformGridGPU.cpp LdcSetup.h
FILES NonUniformGridGPU.cpp LdcSetup.h GridGeneration.h
DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated )
\ No newline at end of file
//======================================================================================================================
//
// This file is part of waLBerla. waLBerla is free software: you can
// redistribute it and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation, either version 3 of
// the License, or (at your option) any later version.
//
// waLBerla is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
//
//! \file GridGeneration.h
//! \author Markus Holzer <markus.holzer@fau.de>
//
//======================================================================================================================
#pragma once
#include "blockforest/Initialization.h"
#include "blockforest/SetupBlock.h"
#include "blockforest/SetupBlockForest.h"
#include "blockforest/loadbalancing/StaticCurve.h"
#include "core/Environment.h"
#include "core/logging/Initialization.h"
#include "core/timing/RemainingTimeLogger.h"
#include "core/timing/TimingPool.h"
#include <string>
#include "LdcSetup.h"
#include "NonUniformGridGPUInfoHeader.h"
using StorageSpecification_T = lbm::NonUniformGridGPUStorageSpecification;
using Stencil_T = StorageSpecification_T::Stencil;
using namespace walberla;
void createSetupBlockForest(SetupBlockForest& setupBfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup,
const bool useMPIManager=false)
{
WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...")
Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize");
Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock");
Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks");
Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic");
const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1));
uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses");
const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest");
const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false);
const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false);
if(useMPIManager)
numProcesses = uint_c(mpi::MPIManager::instance()->numProcesses());
const LDC ldc(refinementDepth);
auto refSelection = ldc.refinementSelector();
setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection));
const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]);
setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment);
setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]);
setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses);
if(mpi::MPIManager::instance()->numProcesses() > 1)
return;
{
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
setupBfs.saveToFile(oss.str().c_str());
}
if(writeVtk){
setupBfs.writeVTKOutput(blockForestFilestem);
}
if(outputStatistics){
WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================");
WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks())
for (uint_t level = 0; level <= refinementDepth; level++){
const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level);
WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks)
}
const real_t avgBlocksPerProc = real_c(setupBfs.getNumberOfBlocks()) / real_c(setupBfs.getNumberOfProcesses());
WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc);
const uint_t totalNumberCells = setupBfs.getNumberOfBlocks() * cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2];
const real_t averageCellsPerGPU = avgBlocksPerProc * real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
const uint_t PDFsPerCell = StorageSpecification_T::inplace ? Stencil_T::Q : 2 * Stencil_T::Q;
const uint_t valuesPerCell = (PDFsPerCell + VelocityField_T::F_SIZE + ScalarField_T::F_SIZE);
const uint_t sizePerValue = sizeof(StorageSpecification_T::value_type);
const double expectedMemory = double_c(totalNumberCells * valuesPerCell * sizePerValue) * 1e-9;
const double expectedMemoryPerGPU = double_c(averageCellsPerGPU * valuesPerCell * sizePerValue) * 1e-9;
WALBERLA_LOG_INFO_ON_ROOT( "Total number of cells will be " << totalNumberCells << " fluid cells (in total on all levels)")
WALBERLA_LOG_INFO_ON_ROOT( "Expected total memory demand will be " << expectedMemory << " GB")
WALBERLA_LOG_INFO_ON_ROOT( "Average memory demand per GPU will be " << expectedMemoryPerGPU << " GB")
WALBERLA_LOG_INFO_ON_ROOT("=================================================================================");
}
}
void createBlockForest(shared_ptr< BlockForest >& bfs,
const Config::BlockHandle& domainSetup, const Config::BlockHandle& blockForestSetup)
{
if (mpi::MPIManager::instance()->numProcesses() > 1){
const std::string blockForestFilestem =
blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest");
// Load structured block forest from file
std::ostringstream oss;
oss << blockForestFilestem << ".bfs";
const std::string setupBlockForestFilepath = oss.str();
std::ifstream infile(setupBlockForestFilepath.c_str());
if(!infile.good()){
WALBERLA_LOG_WARNING_ON_ROOT("Blockforest was not created beforehand and thus needs to be created on the fly. For large simulation runs this can be a severe problem!")
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup, true);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
else{
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()),
setupBlockForestFilepath.c_str(), false);
}
}
else{
SetupBlockForest setupBfs;
createSetupBlockForest(setupBfs, domainSetup, blockForestSetup);
bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs);
}
}
\ No newline at end of file
......@@ -31,7 +31,9 @@
#include "field/FlagUID.h"
using namespace walberla;
using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction;
using FlagField_T = FlagField< uint8_t >;
class LDCRefinement
......@@ -46,14 +48,8 @@ class LDCRefinement
{
const AABB & domain = forest.getDomain();
const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 );
const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 );
const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(),
domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() );
const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(),
domain.xMax(), domain.yMin() + ySize, domain.zMax() );
const AABB leftCorner( 0, domain.yMax() -1, 0, 1, domain.yMax() , domain.zMax() );
const AABB rightCorner( domain.xMax() - 1, domain.yMax() -1, 0, domain.xMax(), domain.yMax() , domain.zMax() );
for(auto & block : forest)
{
......@@ -99,8 +95,7 @@ class LDC
Cell globalCell(localCell);
sbfs.transformBlockLocalToGlobalCell(globalCell, b);
if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); }
else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 ||
globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level)))
else if (globalCell.y() < 0 || globalCell.x() < 0 || globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)))
{
flagField->addFlag(localCell, noslipFlag);
}
......
This diff is collapsed.
......@@ -7,7 +7,7 @@ from pystencils.typing import TypedSymbol
from lbmpy.advanced_streaming.utility import get_timesteps
from lbmpy.boundaries import NoSlip, UBB
from lbmpy.creationfunctions import create_lb_method, create_lb_collision_rule
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil
from lbmpy import LBMConfig, LBMOptimisation, Stencil, Method, LBStencil, SubgridScaleModel
from pystencils_walberla import CodeGeneration, generate_info_header
from lbmpy_walberla import generate_lbm_package, lbm_boundary_generator
......@@ -30,20 +30,25 @@ const char * infoCollisionSetup = "{collision_setup}";
const bool infoCseGlobal = {cse_global};
const bool infoCsePdfs = {cse_pdfs};
"""
with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
streaming_pattern = 'pull'
streaming_pattern = 'esopull'
timesteps = get_timesteps(streaming_pattern)
stencil = LBStencil(Stencil.D3Q19)
method_enum = Method.CUMULANT
fourth_order_correction = 0.01 if method_enum == Method.CUMULANT and stencil.Q == 27 else False
collision_setup = "cumulant-K17" if fourth_order_correction else method_enum.name.lower()
assert stencil.D == 3, "This application supports only three-dimensional stencils"
pdfs, pdfs_tmp = ps.fields(f"pdfs({stencil.Q}), pdfs_tmp({stencil.Q}): {field_type}[3D]", layout='fzyx')
density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx')
macroscopic_fields = {'density': density_field, 'velocity': velocity_field}
lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega,
lbm_config = LBMConfig(stencil=stencil, method=method_enum, relaxation_rate=omega, compressible=True,
fourth_order_correction=fourth_order_correction,
streaming_pattern=streaming_pattern)
lbm_opt = LBMOptimisation(cse_global=False, field_layout='fzyx')
......@@ -66,7 +71,7 @@ with CodeGeneration() as ctx:
infoHeaderParams = {
'stencil': stencil.name.lower(),
'streaming_pattern': streaming_pattern,
'collision_setup': lbm_config.method.name.lower(),
'collision_setup': collision_setup,
'cse_global': int(lbm_opt.cse_global),
'cse_pdfs': int(lbm_opt.cse_pdfs),
}
......
import waLBerla as wlb
from waLBerla.tools.config import block_decomposition
from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle
import sqlite3
import os
import sys
try:
import machinestate as ms
except ImportError:
ms = None
DB_FILE = os.environ.get('DB_FILE', "gpu_benchmark.sqlite3")
BENCHMARK = int(os.environ.get('BENCHMARK', 0))
WeakX = int(os.environ.get('WeakX', 128))
WeakY = int(os.environ.get('WeakY', 128))
WeakZ = int(os.environ.get('WeakZ', 128))
StrongX = int(os.environ.get('StrongX', 128))
StrongY = int(os.environ.get('StrongY', 128))
StrongZ = int(os.environ.get('StrongZ', 128))
class Scenario:
def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2),
cells_per_block=(32, 32, 32), refinement_depth=0):
def __init__(self,
domain_size=(64, 64, 64),
root_blocks=(2, 2, 2),
num_processes=1,
refinement_depth=0,
cells_per_block=(32, 32, 32),
timesteps=101,
gpu_enabled_mpi=False,
vtk_write_frequency=0,
logger_frequency=30,
blockforest_filestem="blockforest",
write_setup_vtk=True,
db_file_name=None):
self.domain_size = domain_size
self.root_blocks = root_blocks
self.cells_per_block = cells_per_block
self.periodic = (0, 0, 1)
self.refinement_depth = refinement_depth
self.num_processes = num_processes
self.bfs_filestem = blockforest_filestem
self.write_setup_vtk = write_setup_vtk
self.timesteps = timesteps
self.gpu_enabled_mpi = gpu_enabled_mpi
self.vtk_write_frequency = vtk_write_frequency
self.logger_frequency = logger_frequency
self.periodic = (0, 0, 0)
self.db_file_name = DB_FILE if db_file_name is None else db_file_name
self.config_dict = self.config(print_dict=False)
......@@ -22,39 +64,79 @@ class Scenario:
'domainSize': self.domain_size,
'rootBlocks': self.root_blocks,
'cellsPerBlock': self.cells_per_block,
'periodic': self.periodic
'periodic': self.periodic,
},
'SetupBlockForest': {
'refinementDepth': self.refinement_depth,
'numProcesses': self.num_processes,
'blockForestFilestem': self.bfs_filestem,
'writeVtk': self.write_setup_vtk,
'outputStatistics': True,
'writeSetupForestAndReturn': True,
},
'Parameters': {
'omega': 1.95,
'timesteps': 30001,
'refinementDepth': self.refinement_depth,
'writeSetupForestAndReturn': False,
'numProcesses': 1,
'cudaEnabledMPI': False,
'benchmarkKernelOnly': False,
'remainingTimeLoggerFrequency': 3,
'vtkWriteFrequency': 10000,
'timesteps': self.timesteps,
'remainingTimeLoggerFrequency': self.logger_frequency,
'vtkWriteFrequency': self.vtk_write_frequency,
'useVTKAMRWriter': True,
'oneFilePerProcess': False,
'writeOnlySlice': False,
'gpuEnabledMPI': self.gpu_enabled_mpi,
'gpuBlockSize': (128, 1, 1),
},
'Logging': {
'logLevel': "info",
}
}
if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False:
if print_dict:
wlb.log_info_on_root("Scenario:\n" + pformat(config_dict))
return config_dict
@wlb.member_callback
def results_callback(self, **kwargs):
data = {}
data.update(self.config_dict['Parameters'])
data.update(self.config_dict['DomainSetup'])
data.update(kwargs)
data['executable'] = sys.argv[0]
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
result = data
sequenceValuesToScalars(result)
num_tries = 4
# check multiple times e.g. may fail when multiple benchmark processes are running
table_name = f"runs"
table_name = table_name.replace("-", "_")
for num_try in range(num_tries):
try:
checkAndUpdateSchema(result, table_name, self.db_file_name)
storeSingle(result, table_name, self.db_file_name)
break
except sqlite3.OperationalError as e:
wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}")
def validation_run():
"""Run with full periodic shear flow or boundary scenario (ldc) to check if the code works"""
wlb.log_info_on_root("Validation run")
domain_size = (96, 96, 96)
cells_per_block = (32, 32, 32)
domain_size = (192, 192, 64)
cells_per_block = (64, 64, 64)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
......@@ -62,8 +144,91 @@ def validation_run():
scenario = Scenario(domain_size=domain_size,
root_blocks=root_blocks,
cells_per_block=cells_per_block,
refinement_depth=1)
timesteps=0,
vtk_write_frequency=0,
refinement_depth=3,
gpu_enabled_mpi=False)
scenarios.add(scenario)
validation_run()
def weak_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running weak scaling benchmark...")
# This benchmark must run from 16 GPUs onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if uniform:
factor = 3 * num_proc
name = "uniform"
else:
if num_proc % 16 != 0:
raise RuntimeError("Number of processes must be divisible by 16")
factor = int(num_proc // 16)
name = "nonuniform"
cells_per_block = (WeakX, WeakY, WeakZ)
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"weakScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
def strong_scaling_ldc(num_proc, gpu_enabled_mpi=False, uniform=True):
wlb.log_info_on_root("Running strong scaling benchmark...")
# This benchmark must run from 64 GPUs onwards
if wlb.mpi.numProcesses() > 1:
num_proc = wlb.mpi.numProcesses()
if num_proc % 64 != 0:
raise RuntimeError("Number of processes must be divisible by 64")
cells_per_block = (StrongX, StrongY, StrongZ)
if uniform:
domain_size = (cells_per_block[0] * 2, cells_per_block[1] * 2, cells_per_block[2] * 16)
name = "uniform"
else:
factor = int(num_proc / 64)
blocks64 = block_decomposition(factor)
cells_per_block = tuple([int(c / b) for c, b in zip(cells_per_block, reversed(blocks64))])
domain_size = (cells_per_block[0] * 3, cells_per_block[1] * 3, cells_per_block[2] * factor)
name = "nonuniform"
root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)])
scenarios = wlb.ScenarioManager()
scenario = Scenario(blockforest_filestem=f"blockforest_{name}_{num_proc}",
domain_size=domain_size,
root_blocks=root_blocks,
num_processes=num_proc,
cells_per_block=cells_per_block,
refinement_depth=0 if uniform else 3,
timesteps=10,
gpu_enabled_mpi=gpu_enabled_mpi,
db_file_name=f"strongScalingGPU{name}LDC.sqlite3")
scenarios.add(scenario)
if BENCHMARK == 0:
validation_run()
elif BENCHMARK == 1:
weak_scaling_ldc(1, True, False)
elif BENCHMARK == 2:
strong_scaling_ldc(1, True, False)
else:
print(f"Invalid benchmark case {BENCHMARK}")
......@@ -15,10 +15,10 @@ waLBerla_generate_target_from_python(NAME BenchmarkPhaseFieldCodeGen
if (WALBERLA_BUILD_WITH_GPU_SUPPORT )
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core gpu field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core gpu field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
else ()
waLBerla_add_executable(NAME benchmark_multiphase
FILES benchmark_multiphase.cpp InitializerFunctions.cpp multiphase_codegen.py
DEPENDS blockforest core field postprocessing python_coupling lbm geometry timeloop gui BenchmarkPhaseFieldCodeGen)
DEPENDS blockforest core field postprocessing python_coupling lbm_generated geometry timeloop gui BenchmarkPhaseFieldCodeGen)
endif (WALBERLA_BUILD_WITH_GPU_SUPPORT )
......@@ -8,6 +8,11 @@ from waLBerla.tools.config import block_decomposition
import sys
from math import prod
try:
import machinestate as ms
except ImportError:
ms = None
def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, size_per_value=8):
"""Checks if a single block of given size fits into GPU memory"""
......@@ -20,7 +25,9 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q_phase=15, q_hydro=27, s
class Scenario:
def __init__(self, time_step_strategy, cuda_block_size, cells_per_block=(256, 256, 256),
def __init__(self, time_step_strategy,
cuda_block_size,
cells_per_block=(256, 256, 256),
cuda_enabled_mpi=False):
# output frequencies
self.vtkWriteFrequency = 0
......@@ -89,6 +96,14 @@ class Scenario:
data['compile_flags'] = wlb.build_info.compiler_flags
data['walberla_version'] = wlb.build_info.version
data['build_machine'] = wlb.build_info.build_machine
if ms:
state = ms.MachineState(extended=False, anonymous=True)
state.generate() # generate subclasses
state.update() # read information
data["MachineState"] = str(state.get())
else:
print("MachineState module is not available. MachineState was not saved")
sequenceValuesToScalars(data)
df = pd.DataFrame.from_records([data])
......@@ -101,43 +116,19 @@ class Scenario:
def benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 40))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_size = (256, 256, 256)
block_size = (320, 320, 320)
cuda_enabled_mpi = True
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
else:
scenarios.add(Scenario(time_step_strategy='normal', cuda_block_size=(256, 1, 1), cells_per_block=block_size))
scenarios.add(Scenario(time_step_strategy='normal',
cuda_block_size=(128, 1, 1),
cells_per_block=block_size,
cuda_enabled_mpi=cuda_enabled_mpi))
def kernel_benchmark():
scenarios = wlb.ScenarioManager()
gpu_mem_gb = int(os.environ.get('GPU_MEMORY_GB', 8))
gpu_mem = gpu_mem_gb * (2 ** 30)
block_sizes = [(i, i, i) for i in (32, 64, 128, 256, 320, 384, 448, 512)]
cuda_blocks = [(32, 1, 1), (64, 1, 1), (128, 1, 1), (256, 1, 1),
(32, 2, 1), (64, 2, 1), (128, 2, 1),
(32, 4, 1), (64, 4, 1),
(32, 4, 2),
(32, 8, 1),
(16, 16, 1)]
for time_step_strategy in ['phase_only', 'hydro_only', 'kernel_only', 'normal']:
for cuda_block in cuda_blocks:
for block_size in block_sizes:
if not domain_block_size_ok(block_size, gpu_mem):
wlb.log_info_on_root(f"Block size {block_size} would exceed GPU memory. Skipping.")
continue
scenario = Scenario(time_step_strategy=time_step_strategy,
cuda_block_size=cuda_block,
cells_per_block=block_size)
scenarios.add(scenario)
# benchmark()
kernel_benchmark()
benchmark()
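The exact accounting lives in the collapsed body of domain_block_size_ok, but a rough back-of-the-envelope using only its default arguments suggests why the default GPU memory was raised from 8 GB to 40 GB together with the larger block size (an estimate under stated assumptions, not the function's actual formula):

q_phase, q_hydro, size_per_value, gls = 15, 27, 8, 1
values_per_cell = q_phase + q_hydro + 3 + 1        # PDFs + velocity(3) + phase, ignoring temporaries
cells = (320 + 2 * gls) ** 3                       # one ghost layer around a 320^3 block
print(cells * values_per_cell * size_per_value / 2 ** 30)   # roughly 11 GiB, well below 40 GB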
......@@ -29,6 +29,7 @@
#include "field/vtk/VTKWriter.h"
#include "geometry/InitBoundaryHandling.h"
#include "lbm_generated/evaluation/PerformanceEvaluation.h"
#include "python_coupling/CreateConfig.h"
#include "python_coupling/DictWrapper.h"
......@@ -78,14 +79,10 @@ int main(int argc, char** argv)
logging::configureLogging(config);
shared_ptr< StructuredBlockForest > blocks = blockforest::createUniformBlockGridFromConfig(config);
Vector3< uint_t > cellsPerBlock =
config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock");
// Reading parameters
auto parameters = config->getOneBlock("Parameters");
const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal");
const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50));
const real_t remainingTimeLoggerFrequency =
parameters.getParameter< real_t >("remainingTimeLoggerFrequency", real_c(3.0));
const uint_t scenario = parameters.getParameter< uint_t >("scenario", uint_c(1));
const uint_t warmupSteps = parameters.getParameter< uint_t >("warmupSteps", uint_t(2));
......@@ -102,6 +99,7 @@ int main(int argc, char** argv)
gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true);
BlockDataID phase_field_gpu =
gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true);
BlockDataID phase_field_tmp = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "temporary phasefield", true);
#else
BlockDataID lb_phase_field =
field::addToStorage< PdfField_phase_T >(blocks, "lb phase field", real_c(0.0), field::fzyx);
......@@ -109,6 +107,7 @@ int main(int argc, char** argv)
field::addToStorage< PdfField_hydro_T >(blocks, "lb velocity field", real_c(0.0), field::fzyx);
BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx);
BlockDataID phase_field_tmp = field::addToStorage< PhaseField_T >(blocks, "phase tmp", real_c(0.0), field::fzyx);
#endif
if (timeStepStrategy != "phase_only" && timeStepStrategy != "hydro_only" && timeStepStrategy != "kernel_only")
......@@ -139,47 +138,80 @@ int main(int argc, char** argv)
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu);
pystencils::phase_field_LB_step phase_field_LB_step(
lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
lb_phase_field_gpu, phase_field_gpu, phase_field_tmp, vel_field_gpu, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gpuBlockSize[0],
gpuBlockSize[1], gpuBlockSize[2]);
#else
pystencils::initialize_phase_field_distributions init_h(lb_phase_field, phase_field, vel_field);
pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, vel_field);
pystencils::phase_field_LB_step phase_field_LB_step(lb_phase_field, phase_field, phase_field_tmp, vel_field);
pystencils::hydro_LB_step hydro_LB_step(lb_velocity_field, phase_field, vel_field);
#endif
// add communication
#if defined(WALBERLA_BUILD_WITH_CUDA)
const bool cudaEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
auto Comm_velocity_based_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
const bool gpuEnabledMpi = parameters.getParameter< bool >("cudaEnabledMpi", false);
const int streamLowPriority = 0;
const int streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field_gpu);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu);
Comm_velocity_based_distributions->addPackInfo(generatedPackInfo_phase_field);
auto Comm_phase_field_distributions =
make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu);
Comm_phase_field_distributions->addPackInfo(generatedPackInfo_phase_field_distributions);
#else
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Full_Stencil_T > >(blocks, gpuEnabledMpi, false);
auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, gpuEnabledMpi, false, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_velocity_based_distributions(blocks);
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< gpu::GPUField<real_t> >(phase_field_gpu);
auto phaseFieldTMP = b->getData< gpu::GPUField<real_t> >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#else
auto generatedPackInfo_phase_field_distributions = make_shared< lbm::PackInfo_phase_field_distributions>(lb_phase_field);
auto generatedPackInfo_velocity_based_distributions = make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field);
auto generatedPackInfo_velocity_based_distributions =
make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_phase_field);
Comm_velocity_based_distributions.addPackInfo(generatedPackInfo_velocity_based_distributions);
auto UniformGPUSchemeVelocityBasedDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseFieldDistributions = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks);
auto UniformGPUSchemePhaseField = make_shared< blockforest::communication::UniformBufferedScheme< Full_Stencil_T > >(blocks, 65432);
UniformGPUSchemeVelocityBasedDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions);
UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions);
UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field);
blockforest::communication::UniformBufferedScheme< Stencil_hydro_T > Comm_phase_field_distributions(blocks);
auto generatedPackInfo_phase_field_distributions =
make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field);
Comm_phase_field_distributions.addPackInfo(generatedPackInfo_phase_field_distributions);
auto Comm_velocity_based_distributions_start = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->startCommunication(); });
auto Comm_velocity_based_distributions_wait = std::function< void() >([&]() { UniformGPUSchemeVelocityBasedDistributions->wait(); });
auto Comm_phase_field_distributions = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); });
auto Comm_phase_field_distributions_start = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); });
auto Comm_phase_field_distributions_wait = std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); });
auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); });
auto swapPhaseField = std::function< void(IBlock *) >([&](IBlock * b)
{
auto phaseField = b->getData< PhaseField_T >(phase_field);
auto phaseFieldTMP = b->getData< PhaseField_T >(phase_field_tmp);
phaseField->swapDataPointers(phaseFieldTMP);
});
#endif
BlockDataID const flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field");
......@@ -201,99 +233,37 @@ int main(int argc, char** argv)
init_h(&block);
init_g(&block);
}
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("initialization of the distributions done")
}
SweepTimeloop timeloop(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
int const streamLowPriority = 0;
int const streamHighPriority = 0;
auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority);
auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority);
#endif
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(defaultStream), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps);
#if defined(WALBERLA_BUILD_WITH_CUDA)
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions->startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block, defaultStream);
Comm_velocity_based_distributions->wait();
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(defaultStream), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
Comm_phase_field_distributions->startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block, defaultStream);
Comm_phase_field_distributions->wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#else
auto normalTimeStep = [&]() {
Comm_velocity_based_distributions.startCommunication();
for (auto& block : *blocks)
phase_field_LB_step(&block);
Comm_velocity_based_distributions.wait();
Comm_phase_field_distributions.startCommunication();
for (auto& block : *blocks)
hydro_LB_step(&block);
Comm_phase_field_distributions.wait();
};
auto phase_only = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
};
auto hydro_only = [&]() {
for (auto& block : *blocks)
hydro_LB_step(&block);
};
auto without_comm = [&]() {
for (auto& block : *blocks)
phase_field_LB_step(&block);
for (auto& block : *blocks)
hydro_LB_step(&block);
};
#endif
std::function< void() > timeStep;
if (timeStepStrategy == "phase_only")
{
timeStep = std::function< void() >(phase_only);
WALBERLA_LOG_INFO_ON_ROOT("started only phasefield step without communication for benchmarking")
}
else if (timeStepStrategy == "hydro_only")
{
timeStep = std::function< void() >(hydro_only);
WALBERLA_LOG_INFO_ON_ROOT("started only hydro step without communication for benchmarking")
}
else if (timeStepStrategy == "kernel_only")
{
timeStep = std::function< void() >(without_comm);
WALBERLA_LOG_INFO_ON_ROOT("started complete phasefield model without communication for benchmarking")
}
else
{
timeStep = std::function< void() >(normalTimeStep);
WALBERLA_LOG_INFO_ON_ROOT("normal timestep with overlapping")
}
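For orientation, the strategy strings checked here come from the parameter input; a minimal hedged sketch of how a Python simulation_setup script might provide them (only the key names appear in the code above; the surrounding block name 'Parameters' and all values are assumptions):

# hypothetical parameter block for the benchmark driver (illustration only)
parameters = {
    'Parameters': {
        'timesteps': 500,                    # assumed value
        'timeStepStrategy': 'normal',        # or 'phase_only', 'hydro_only', 'kernel_only'
        'warmupSteps': 10,                   # assumed value
        'vtkWriteFrequency': 0,              # assumed value
        'remainingTimeLoggerFrequency': 20,  # assumed value
    }
}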
timeloop.add() << BeforeFunction(Comm_velocity_based_distributions_start, "Start Hydro PDFs Communication")
<< Sweep(phase_field_LB_step.getSweep(), "Phase LB Step")
<< AfterFunction(Comm_velocity_based_distributions_wait, "Wait Hydro PDFs Communication");
timeLoop->add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step");
timeloop.add() << BeforeFunction(Comm_phase_field_distributions_start, "Start Phase PDFs Communication")
<< Sweep(hydro_LB_step.getSweep(), "Hydro LB Step");
timeloop.add() << Sweep(swapPhaseField, "Swap PhaseField")
<< AfterFunction(Comm_phase_field_distributions_wait, "Wait Phase PDFs Communication");
// remaining time logger
if (remainingTimeLoggerFrequency > 0)
timeLoop->addFuncAfterTimeStep(
timing::RemainingTimeLogger(timeLoop->getNrOfTimeSteps(), remainingTimeLoggerFrequency),
"remaining time logger");
timeloop.addFuncAfterTimeStep(Comm_phase_field, "Communication Phase field");
#endif
uint_t const vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0);
if (vtkWriteFrequency > 1)
@@ -307,40 +277,60 @@ int main(int argc, char** argv)
auto phaseWriter = make_shared< field::VTKWriter< PhaseField_T > >(phase_field, "phase");
vtkOutput->addCellDataWriter(phaseWriter);
timeLoop->addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
timeloop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output");
}
lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID);
field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID);
fluidCells();
WALBERLA_LOG_INFO_ON_ROOT("Multiphase benchmark with " << fluidCells.numberOfCells() << " fluid cells")
WALBERLA_LOG_INFO_ON_ROOT("Running " << warmupSteps << " timesteps to warm up the system")
for (uint_t i = 0; i < warmupSteps; ++i)
timeLoop->singleStep();
timeloop.singleStep();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Warmup timesteps done")
timeLoop->setCurrentTimeStepToZero();
timeloop.setCurrentTimeStepToZero();
WALBERLA_MPI_BARRIER()
WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps")
WcTimingPool timeloopTiming;
WcTimer simTimer;
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
#endif
simTimer.start();
timeLoop->run();
timeloop.run(timeloopTiming);
#if defined(WALBERLA_BUILD_WITH_CUDA)
cudaDeviceSynchronize();
WALBERLA_GPU_CHECK(gpuDeviceSynchronize())
WALBERLA_GPU_CHECK(gpuPeekAtLastError())
#endif
WALBERLA_MPI_BARRIER()
simTimer.end();
WALBERLA_LOG_INFO_ON_ROOT("Simulation finished")
auto time = real_c(simTimer.last());
auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]);
auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6;
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << mlupsPerProcess)
double time = simTimer.max();
WALBERLA_MPI_SECTION() { walberla::mpi::reduceInplace(time, walberla::mpi::MAX); }
performance.logResultOnRoot(timesteps, time);
const auto reducedTimeloopTiming = timeloopTiming.getReduced();
WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming)
WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process: " << performance.mlupsPerProcess(timesteps, time))
WALBERLA_LOG_RESULT_ON_ROOT("Time per time step: " << time / real_c(timesteps) << " s")
WALBERLA_ROOT_SECTION()
{
python_coupling::PythonCallback pythonCallbackResults("results_callback");
if (pythonCallbackResults.isCallable())
{
pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess);
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil_phase", StencilNamePhase);
pythonCallbackResults.data().exposeValue("stencil_hydro", StencilNameHydro);
#if defined(WALBERLA_BUILD_WITH_CUDA)
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", cudaEnabledMpi);
pythonCallbackResults.data().exposeValue("cuda_enabled_mpi", gpuEnabledMpi);
#endif
// Call Python function to report results
pythonCallbackResults();
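The exposed values are consumed on the Python side by a function named results_callback in the simulation setup script; a minimal hedged sketch of such a callback (assuming, as in waLBerla's benchmark scenario scripts, that the exposed values arrive as keyword arguments):

def results_callback(**kwargs):
    # kwargs is expected to carry the values exposed above,
    # e.g. 'mlupsPerProcess', 'stencil_phase', 'stencil_hydro'
    print("MLUPS per process:", kwargs.get("mlupsPerProcess"))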
@@ -19,7 +19,7 @@ with CodeGeneration() as ctx:
field_type = "float64" if ctx.double_accuracy else "float32"
stencil_phase = LBStencil(Stencil.D3Q15)
stencil_hydro = LBStencil(Stencil.D3Q27)
stencil_hydro = LBStencil(Stencil.D3Q19)
assert (stencil_phase.D == stencil_hydro.D)
########################
@@ -76,13 +76,13 @@ with CodeGeneration() as ctx:
delta_equilibrium=False,
force=sp.symbols(f"F_:{stencil_phase.D}"), velocity_input=u,
weighted=True, relaxation_rates=rates,
output={'density': C_tmp}, kernel_type='stream_pull_collide')
output={'density': C_tmp})
method_phase = create_lb_method(lbm_config=lbm_config_phase)
lbm_config_hydro = LBMConfig(stencil=stencil_hydro, method=Method.MRT, compressible=False,
weighted=True, relaxation_rate=omega,
force=sp.symbols(f"F_:{stencil_hydro.D}"),
output={'velocity': u}, kernel_type='collide_stream_push')
output={'velocity': u})
method_hydro = create_lb_method(lbm_config=lbm_config_hydro)
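The two create_lb_method calls above yield lbmpy method objects that can also be inspected outside the code generation context; a hedged sketch (the attribute names are assumed from lbmpy's public API, and the config is a simplified stand-in for the hydro setup above):

import sympy as sp
from lbmpy import LBMConfig, LBStencil, Stencil, Method, create_lb_method

# simplified stand-in for lbm_config_hydro: D3Q19 MRT with a single symbolic rate
cfg = LBMConfig(stencil=LBStencil(Stencil.D3Q19), method=Method.MRT,
                compressible=False, relaxation_rate=sp.Symbol("omega"))
method = create_lb_method(lbm_config=cfg)
print(method.stencil)           # discrete velocity set
print(method.relaxation_rates)  # per-moment relaxation rates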
# create the kernels for the initialization of the g and h field
@@ -137,7 +137,8 @@ with CodeGeneration() as ctx:
sweep_params = {'block_size': sweep_block_size}
stencil_typedefs = {'Stencil_phase_T': stencil_phase,
'Stencil_hydro_T': stencil_hydro}
'Stencil_hydro_T': stencil_hydro,
'Full_Stencil_T': LBStencil(Stencil.D3Q27)}
field_typedefs = {'PdfField_phase_T': h,
'PdfField_hydro_T': g,
'VelocityField_T': u,
@@ -156,7 +157,7 @@ with CodeGeneration() as ctx:
generate_sweep(ctx, 'initialize_velocity_based_distributions', g_updates, target=Target.CPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
inner_outer_split=True,
cpu_vectorize_info=cpu_vec,
target=Target.CPU)
@@ -172,7 +173,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.CPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.CPU)
streaming_pattern='pull', target=Target.CPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.CPU)
@@ -183,7 +184,7 @@ with CodeGeneration() as ctx:
g_updates, target=Target.GPU)
generate_sweep(ctx, 'phase_field_LB_step', phase_field_LB_step,
field_swaps=[(h, h_tmp), (C, C_tmp)],
field_swaps=[(h, h_tmp)],
target=Target.GPU,
gpu_indexing_params=sweep_params,
varying_parameters=vp)
@@ -198,7 +199,7 @@ with CodeGeneration() as ctx:
streaming_pattern='pull', target=Target.GPU)
generate_lb_pack_info(ctx, 'PackInfo_velocity_based_distributions', stencil_hydro, g,
streaming_pattern='push', target=Target.GPU)
streaming_pattern='pull', target=Target.GPU)
generate_pack_info_for_field(ctx, 'PackInfo_phase_field', C, target=Target.GPU)
@@ -3,14 +3,16 @@ waLBerla_link_files_to_builddir( "*.py" )
waLBerla_link_files_to_builddir( "simulation_setup" )
foreach(streaming_pattern pull push aa esotwist)
foreach(streaming_pattern pull push aa esotwist esopull esopush)
foreach(stencil d3q19 d3q27)
foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky)
foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax cumulant-K17 entropic smagorinsky qr)
# KBC methods are only defined for D2Q9 and D3Q27
if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19")
if (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
continue()
endif (${collision_setup} STREQUAL "cumulant-K17" AND ${stencil} STREQUAL "d3q19")
set(config ${stencil}_${streaming_pattern}_${collision_setup})
waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config}
FILE UniformGridCPU.py
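For reference, the nested foreach loops above expand into one generated target per (stencil, streaming pattern, collision setup) combination, minus the excluded pairs; a hedged Python sketch of the resulting config names (illustration only, not part of the build):

from itertools import product

stencils = ["d3q19", "d3q27"]
patterns = ["pull", "push", "aa", "esotwist", "esopull", "esopush"]
setups = ["srt", "trt", "mrt", "mrt-overrelax", "central", "central-overrelax",
          "cumulant", "cumulant-overrelax", "cumulant-K17", "entropic", "smagorinsky", "qr"]

configs = [f"{st}_{pat}_{col}"
           for st, pat, col in product(stencils, patterns, setups)
           if not (st == "d3q19" and col in ("entropic", "cumulant-K17"))]
# each entry maps to a target, e.g. 'd3q19_pull_srt' -> UniformGridCPUGenerated_d3q19_pull_srt
print(len(configs), configs[0])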
@@ -64,6 +64,9 @@ using SweepCollection_T = lbm::UniformGridCPUSweepCollection;
using blockforest::communication::UniformBufferedScheme;
using macroFieldType = VelocityField_T::value_type;
using pdfFieldType = PdfField_T::value_type;
int main(int argc, char** argv)
{
const mpi::Environment env(argc, argv);
@@ -87,10 +90,10 @@ int main(int argc, char** argv)
// Creating fields
const StorageSpecification_T StorageSpec = StorageSpecification_T();
auto fieldAllocator = make_shared< field::AllocateAligned< real_t, 64 > >();
auto fieldAllocator = make_shared< field::AllocateAligned< pdfFieldType, 64 > >();
const BlockDataID pdfFieldId = lbm_generated::addPdfFieldToStorage(blocks, "pdfs", StorageSpec, field::fzyx, fieldAllocator);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_c(1.0), field::fzyx);
const BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", macroFieldType(0.0), field::fzyx);
const BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", macroFieldType(1.0), field::fzyx);
const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field");
// Initialize velocity on cpu
@@ -235,12 +238,16 @@ int main(int argc, char** argv)
pythonCallbackResults.data().exposeValue("numProcesses", performance.processes());
pythonCallbackResults.data().exposeValue("numThreads", performance.threads());
pythonCallbackResults.data().exposeValue("numCores", performance.cores());
pythonCallbackResults.data().exposeValue("numberOfCells", performance.numberOfCells());
pythonCallbackResults.data().exposeValue("numberOfFluidCells", performance.numberOfFluidCells());
pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time));
pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time));
pythonCallbackResults.data().exposeValue("stencil", infoStencil);
pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern);
pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup);
pythonCallbackResults.data().exposeValue("vectorised", vectorised);
pythonCallbackResults.data().exposeValue("nontemporal", nontemporal);
pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal);
pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs);
// Call Python function to report results