diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt index f332e065fed35fa99367127e9d44b211849cc7b3..de1a2c1db0cbd078a80879f964f6a046f98afc95 100644 --- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt @@ -10,6 +10,12 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated UBB.h UBB.cpp NonUniformGridCPUBoundaryCollection.h NonUniformGridCPUInfoHeader.h) + +waLBerla_add_executable( NAME NonUniformGridGenerator + FILES NonUniformGridGenerator.cpp LdcSetup.h + DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk ) + + waLBerla_add_executable( NAME NonUniformGridCPU - FILES NonUniformGridCPU.cpp - DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) \ No newline at end of file + FILES NonUniformGridCPU.cpp LdcSetup.h + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h new file mode 100644 index 0000000000000000000000000000000000000000..1657e85e0e58dc0a1107ba62505e438c1821a1c1 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h @@ -0,0 +1,109 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file LdcSetup.h +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== + +#include "blockforest/SetupBlock.h" +#include "blockforest/SetupBlockForest.h" + +#include "core/all.h" + +#include "field/FlagField.h" + +#include "field/FlagUID.h" + +using namespace walberla; + +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; + +using FlagField_T = FlagField< uint8_t >; + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) const + { + const AABB & domain = forest.getDomain(); + + real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + + AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(), + domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() ); + + AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(), + domain.xMax(), domain.yMax(), domain.zMax() ); + + for(auto & block : forest) + { + auto & aabb = block.getAABB(); + if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) + { + if( block.getLevel() < refinementDepth_) + block.setMarker( true ); + } + } + } +}; + +class LDC +{ + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + + public: + explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + RefinementSelectionFunctor refinementSelector() const + { + return LDCRefinement(refinementDepth_); + } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + auto& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } +}; diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp index 4d3ce332f4f6b23e7fbfb5e885b61c3268232155..6476d82ec1877b7f2732837d89cf76a4d81b2627 100644 --- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.cpp @@ -15,13 +15,10 @@ // //! \file NonUniformGridCPU.cpp //! \author Markus Holzer <markus.holzer@fau.de> +//! \author Frederik Hennig <frederik.hennig@fau.de> // //====================================================================================================================== -#include "blockforest/Initialization.h" -#include "blockforest/SetupBlockForest.h" -#include "blockforest/loadbalancing/StaticCurve.h" - #include "core/Environment.h" #include "core/logging/Initialization.h" #include "core/timing/RemainingTimeLogger.h" @@ -33,20 +30,21 @@ #include "geometry/InitBoundaryHandling.h" -#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" -#include "lbm_generated/field/AddToStorage.h" -#include "lbm_generated/field/PdfField.h" -#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" -#include "lbm_generated/evaluation/PerformanceEvaluation.h" - #include "python_coupling/CreateConfig.h" +#include "python_coupling/DictWrapper.h" #include "python_coupling/PythonCallback.h" #include "timeloop/SweepTimeloop.h" #include <cmath> +#include "LdcSetup.h" #include "NonUniformGridCPUInfoHeader.h" +#include "lbm_generated/communication/NonuniformGeneratedPdfPackInfo.h" +#include "lbm_generated/evaluation/PerformanceEvaluation.h" +#include "lbm_generated/field/AddToStorage.h" +#include "lbm_generated/field/PdfField.h" +#include "lbm_generated/refinement/BasicRecursiveTimeStep.h" using namespace walberla; @@ -55,157 +53,47 @@ using Stencil_T = StorageSpecification_T::Stencil; using CommunicationStencil_T = StorageSpecification_T::CommunicationStencil; using PdfField_T = lbm_generated::PdfField< StorageSpecification_T >; -using FlagField_T = FlagField< uint8_t >; using BoundaryCollection_T = lbm::NonUniformGridCPUBoundaryCollection< FlagField_T >; using SweepCollection_T = lbm::NonUniformGridCPUSweepCollection; using blockforest::communication::NonUniformBufferedScheme; -using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; - - -class LDCRefinement -{ -private: - const uint_t refinementDepth_; - -public: - explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; - - void operator()(SetupBlockForest& forest) const - { - const AABB & domain = forest.getDomain(); - - real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); - real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); - - AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), - domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - - AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), - domain.xMax(), domain.yMin() + ySize, domain.zMax() ); - - for(auto & block : forest) - { - auto & aabb = block.getAABB(); - if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) - { - if( block.getLevel() < refinementDepth_) - block.setMarker( true ); - } - } - } -}; - -class LDC -{ -private: - const std::string refinementProfile_; - const uint_t refinementDepth_; - - const FlagUID noSlipFlagUID_; - const FlagUID ubbFlagUID_; - -public: - explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; - - RefinementSelectionFunctor refinementSelector() const - { - return LDCRefinement(refinementDepth_); - } - - void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) - { - for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) - { - auto& b = dynamic_cast< Block& >(*bIt); - const uint_t level = b.getLevel(); - auto flagField = b.getData< FlagField_T >(flagFieldID); - const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); - const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); - for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) - { - const Cell localCell = cIt.cell(); - Cell globalCell(localCell); - sbfs.transformBlockLocalToGlobalCell(globalCell, b); - if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } - else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || - globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) - { - flagField->addFlag(localCell, noslipFlag); - } - } - } - } -}; - -static void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) -{ - Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize"); - Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks"); - Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic"); - - auto refSelection = ldcSetup.refinementSelector(); - setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection)); - const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); - setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment); - setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); - setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); -} int main(int argc, char** argv) { const mpi::Environment env(argc, argv); mpi::MPIManager::instance()->useWorldComm(); + const std::string input_filename(argv[1]); + const bool inputIsPython = string_ends_with(input_filename, ".py"); + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { WALBERLA_MPI_BARRIER() + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /// SETUP AND CONFIGURATION /// + /// BLOCK FOREST SETUP /// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// auto config = *cfg; logging::configureLogging(config); - auto domainSetup = config->getOneBlock("DomainSetup"); - // Reading parameters - auto parameters = config->getOneBlock("Parameters"); - const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); - const uint_t refinementDepth = parameters.getParameter< uint_t >("refinementDepth", uint_c(1)); - const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); - const bool writeSetupForestAndReturn = parameters.getParameter< bool >("writeSetupForestAndReturn", false); - const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); - const uint_t numProcesses = parameters.getParameter< uint_t >( "numProcesses"); - - auto ldc = std::make_shared< LDC >(refinementDepth); - SetupBlockForest setupBfs; - if (writeSetupForestAndReturn) - { - WALBERLA_LOG_INFO_ON_ROOT("Creating SetupBlockForest for " << numProcesses << " processes") - WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") - createSetupBlockForest(setupBfs, domainSetup, *ldc, numProcesses); - - WALBERLA_ROOT_SECTION() { setupBfs.writeVTKOutput("SetupBlockForest"); } + auto blockForestSetup = config->getOneBlock("SetupBlockForest"); + const std::string blockForestFilestem = + blockForestSetup.getParameter< std::string >("blockForestFilestem", "blockforest"); + const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1)); - WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) - for (uint_t level = 0; level <= refinementDepth; level++) - { - const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); - WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) - } - - WALBERLA_LOG_INFO_ON_ROOT("Ending program") - return EXIT_SUCCESS; - } + auto domainSetup = config->getOneBlock("DomainSetup"); + Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); - WALBERLA_LOG_INFO_ON_ROOT("Generating SetupBlockForest...") - createSetupBlockForest(setupBfs, domainSetup, *ldc); + // Load structured block forest from file + std::ostringstream oss; + oss << blockForestFilestem << ".bfs"; + const std::string setupBlockForestFilepath = oss.str(); - // Create structured block forest - Vector3< uint_t > cellsPerBlock = domainSetup.getParameter< Vector3< uint_t > >("cellsPerBlock"); WALBERLA_LOG_INFO_ON_ROOT("Creating structured block forest...") - auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), setupBfs); + auto bfs = std::make_shared< BlockForest >(uint_c(MPIManager::instance()->worldRank()), + setupBlockForestFilepath.c_str(), false); auto blocks = std::make_shared< StructuredBlockForest >(bfs, cellsPerBlock[0], cellsPerBlock[1], cellsPerBlock[2]); blocks->createCellBoundingBoxes(); @@ -216,6 +104,18 @@ int main(int argc, char** argv) WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << blocks->getNumberOfBlocks(level)) } + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// DISTRIBUTED DATA STRUCTURES /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto ldc = std::make_shared< LDC >(refinementDepth); + + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool benchmarkKernelOnly = parameters.getParameter< bool >("benchmarkKernelOnly", false); + // Creating fields const StorageSpecification_T StorageSpec = StorageSpecification_T(); const BlockDataID pdfFieldID = @@ -263,12 +163,11 @@ int main(int argc, char** argv) SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); - if(benchmarkKernelOnly){ + if (benchmarkKernelOnly) + { timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); } - else{ - LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); - } + else { LBMMeshRefinement.addRefinementToTimeLoop(timeLoop); } // VTK const uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); @@ -298,11 +197,12 @@ int main(int argc, char** argv) timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); } - lbm_generated::PerformanceEvaluation<FlagField_T> const performance(blocks, flagFieldID, fluidFlagUID); - field::CellCounter< FlagField_T > fluidCells( blocks, flagFieldID, fluidFlagUID ); + lbm_generated::PerformanceEvaluation< FlagField_T > const performance(blocks, flagFieldID, fluidFlagUID); + field::CellCounter< FlagField_T > fluidCells(blocks, flagFieldID, fluidFlagUID); fluidCells(); - WALBERLA_LOG_INFO_ON_ROOT( "Non uniform Grid benchmark with " << fluidCells.numberOfCells() << " fluid cells (in total on all levels)") + WALBERLA_LOG_INFO_ON_ROOT("Non uniform Grid benchmark with " << fluidCells.numberOfCells() + << " fluid cells (in total on all levels)") WcTimingPool timeloopTiming; WcTimer simTimer; @@ -323,6 +223,31 @@ int main(int argc, char** argv) const auto reducedTimeloopTiming = timeloopTiming.getReduced(); WALBERLA_LOG_RESULT_ON_ROOT("Time loop timing:\n" << *reducedTimeloopTiming) + + WALBERLA_ROOT_SECTION() + { + if (inputIsPython) + { + python_coupling::PythonCallback pythonCallbackResults("results_callback"); + if (pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue("numProcesses", performance.processes()); + pythonCallbackResults.data().exposeValue("numThreads", performance.threads()); + pythonCallbackResults.data().exposeValue("numCores", performance.cores()); + pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time)); + pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time)); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", + performance.mlupsPerProcess(timesteps, time)); + pythonCallbackResults.data().exposeValue("stencil", infoStencil); + pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); + pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); + pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); + pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); + // Call Python function to report results + pythonCallbackResults(); + } + } + } } return EXIT_SUCCESS; } \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py index 3b350b6c9c48e0418244101cb3de1daec26c34ce..25e9342058f24b7ae6008ee66a42beb8dd230a5a 100644 --- a/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridCPU.py @@ -24,7 +24,7 @@ const bool infoCsePdfs = {cse_pdfs}; with CodeGeneration() as ctx: field_type = "float64" if ctx.double_accuracy else "float32" - streaming_pattern = 'pull' + streaming_pattern = 'aa' timesteps = get_timesteps(streaming_pattern) stencil = LBStencil(Stencil.D3Q19) @@ -33,7 +33,7 @@ with CodeGeneration() as ctx: density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') macroscopic_fields = {'density': density_field, 'velocity': velocity_field} - lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, + lbm_config = LBMConfig(stencil=stencil, method=Method.SRT, relaxation_rate=omega, compressible=True, streaming_pattern=streaming_pattern) lbm_opt = LBMOptimisation(cse_global=False, field_layout="fzyx") diff --git a/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp b/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d7eab3046a8969f7ccc09bc7941315d924412822 --- /dev/null +++ b/apps/benchmarks/NonUniformGridCPU/NonUniformGridGenerator.cpp @@ -0,0 +1,98 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file NonUniformGridGenerator.cpp +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/SetupBlock.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/loadbalancing/StaticCurve.h" + +#include "core/all.h" + +#include "python_coupling/CreateConfig.h" + +#include <string> + +#include "LdcSetup.h" + +using namespace walberla; + + +int main(int argc, char ** argv){ + const mpi::Environment env(argc, argv); + mpi::MPIManager::instance()->useWorldComm(); + + if(mpi::MPIManager::instance()->numProcesses() > 1){ + WALBERLA_ABORT("Commandment: Thou shalt not run thy grid generator with more than one process."); + } + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + auto config = *cfg; + auto domainSetup = config->getOneBlock("DomainSetup"); + + Vector3<real_t> domainSize = domainSetup.getParameter<Vector3<real_t> >("domainSize"); + Vector3<uint_t> rootBlocks = domainSetup.getParameter<Vector3<uint_t> >("rootBlocks"); + Vector3<bool> periodic = domainSetup.getParameter<Vector3<bool> >("periodic"); + + auto blockForestSetup = config->getOneBlock("SetupBlockForest"); + const uint_t refinementDepth = blockForestSetup.getParameter< uint_t >("refinementDepth", uint_c(1)); + const uint_t numProcesses = blockForestSetup.getParameter< uint_t >( "numProcesses"); + const std::string blockForestFilestem = blockForestSetup.getParameter< std::string > ("blockForestFilestem", "blockforest"); + const bool writeVtk = blockForestSetup.getParameter< bool >("writeVtk", false); + const bool outputStatistics = blockForestSetup.getParameter< bool >("outputStatistics", false); + + const LDC ldc(refinementDepth); + SetupBlockForest setupBfs; + + auto refSelection = ldc.refinementSelector(); + setupBfs.addRefinementSelectionFunction(std::function<void(SetupBlockForest &)>(refSelection)); + const AABB domain(real_t(0.0), real_t(0.0), real_t(0.0), domainSize[0], domainSize[1], domainSize[2]); + setupBfs.addWorkloadMemorySUIDAssignmentFunction(blockforest::uniformWorkloadAndMemoryAssignment); + setupBfs.init(domain, rootBlocks[0], rootBlocks[1], rootBlocks[2], periodic[0], periodic[1], periodic[2]); + setupBfs.balanceLoad(blockforest::StaticLevelwiseCurveBalanceWeighted(), numProcesses); + + { + std::ostringstream oss; + oss << blockForestFilestem << ".bfs"; + setupBfs.saveToFile(oss.str().c_str()); + } + + if(writeVtk){ + setupBfs.writeVTKOutput(blockForestFilestem); + } + + if(outputStatistics){ + WALBERLA_LOG_INFO_ON_ROOT("=========================== BLOCK FOREST STATISTICS ============================"); + WALBERLA_LOG_INFO_ON_ROOT("Blocks created: " << setupBfs.getNumberOfBlocks()) + for (uint_t level = 0; level <= refinementDepth; level++) + { + const uint_t numberOfBlocks = setupBfs.getNumberOfBlocks(level); + WALBERLA_LOG_INFO_ON_ROOT("Level " << level << " Blocks: " << numberOfBlocks) + } + + const uint_t avgBlocksPerProc = setupBfs.getNumberOfBlocks() / setupBfs.getNumberOfProcesses(); + WALBERLA_LOG_INFO_ON_ROOT("Average blocks per process: " << avgBlocksPerProc); + WALBERLA_LOG_INFO_ON_ROOT("================================================================================="); + } + + + WALBERLA_LOG_INFO_ON_ROOT("Ending program") + } +} diff --git a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..8c99c62c2fc7c3791239b417b813fa6a1d62f1cd 100644 --- a/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridCPU/simulation_setup/benchmark_configs.py @@ -1,16 +1,37 @@ import waLBerla as wlb +from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle +import sqlite3 +import os +import sys +DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3") class Scenario: - def __init__(self, domain_size=(64, 64, 64), root_blocks=(2, 2, 2), - cells_per_block=(32, 32, 32), refinement_depth=0): + def __init__(self, + domain_size=(64, 64, 64), + root_blocks=(2, 2, 2), + num_processes=1, + refinement_depth=0, + cells_per_block=(32, 32, 32), + timesteps=101, + vtk_write_frequency=0, + logger_frequency=0, + blockforest_filestem="blockforest", + write_setup_vtk=False): self.domain_size = domain_size self.root_blocks = root_blocks self.cells_per_block = cells_per_block + self.periodic = (0, 0, 0) + self.refinement_depth = refinement_depth + self.num_processes = num_processes + self.bfs_filestem = blockforest_filestem + self.write_setup_vtk = write_setup_vtk - self.periodic = (0, 0, 0) + self.timesteps = timesteps + self.vtk_write_frequency = vtk_write_frequency + self.logger_frequency = logger_frequency self.config_dict = self.config(print_dict=False) @@ -24,30 +45,56 @@ class Scenario: 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic }, + 'SetupBlockForest': { + 'refinementDepth': self.refinement_depth, + 'numProcesses': self.num_processes, + 'blockForestFilestem': self.bfs_filestem, + 'writeVtk': self.write_setup_vtk, + 'outputStatistics': False + }, 'Parameters': { 'omega': 1.95, - 'timesteps': 10001, - - 'refinementDepth': self.refinement_depth, - 'writeSetupForestAndReturn': False, - 'numProcesses': 1, - - 'cudaEnabledMPI': False, - 'benchmarkKernelOnly': False, - - 'remainingTimeLoggerFrequency': 3, - - 'vtkWriteFrequency': 5000, + 'timesteps': self.timesteps, + 'remainingTimeLoggerFrequency': self.logger_frequency, + 'vtkWriteFrequency': self.vtk_write_frequency, }, 'Logging': { 'logLevel': "info", } } - if print_dict and config_dict["Parameters"]["writeSetupForestAndReturn"] is False: + if(print_dict): wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + sequenceValuesToScalars(data) + + result = data + sequenceValuesToScalars(result) + num_tries = 4 + # check multiple times e.g. may fail when multiple benchmark processes are running + table_name = f"runs" + table_name = table_name.replace("-", "_") + for num_try in range(num_tries): + try: + checkAndUpdateSchema(result, table_name, DB_FILE) + storeSingle(result, table_name, DB_FILE) + break + except sqlite3.OperationalError as e: + wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}") + def validation_run(): """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" @@ -61,9 +108,31 @@ def validation_run(): scenarios = wlb.ScenarioManager() scenario = Scenario(domain_size=domain_size, root_blocks=root_blocks, + num_processes=1, + refinement_depth=1, cells_per_block=cells_per_block, - refinement_depth=1) + timesteps=201, + vtk_write_frequency=100, + logger_frequency=5, + write_setup_vtk=True) scenarios.add(scenario) +def scaling(): + wlb.log_info_on_root("Running scaling benchmark...") + + numProc = wlb.mpi.numProcesses() + + domain_size = (256, 256, 128 * numProc) + cells_per_block = (64, 64, 64) + root_blocks = tuple([d // c for d, c in zip(domain_size, cells_per_block)]) + + scenarios = wlb.ScenarioManager() + scenario = Scenario(domain_size=domain_size, + root_blocks=root_blocks, + cells_per_block=cells_per_block, + refinement_depth=2, + timesteps=10) + scenarios.add(scenario) validation_run() +# scaling() diff --git a/apps/benchmarks/UniformGridCPU/CMakeLists.txt b/apps/benchmarks/UniformGridCPU/CMakeLists.txt index 0d159bc542c6ada48999dace8e2b7dce4a085519..8e9c1e7af92fcde2102b9e279abed9e451c28858 100644 --- a/apps/benchmarks/UniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt @@ -5,7 +5,7 @@ waLBerla_link_files_to_builddir( "simulation_setup" ) foreach(streaming_pattern pull push aa esotwist) foreach(stencil d3q19 d3q27) - foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky) + foreach (collision_setup srt trt w-mrt r-w-mrt cm r-cm k r-k entropic smagorinsky) # KBC methods only for D2Q9 and D3Q27 defined if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") continue() diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index 64d94ce3d0dd843b29e693d446485bac73b84119..fc229eb116f86ee6c967a8fba5a1948f7b73fc64 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -68,6 +68,9 @@ int main(int argc, char** argv) { const mpi::Environment env(argc, argv); + std::string input_filename(argv[1]); + bool inputIsPython = string_ends_with(input_filename, ".py"); + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { WALBERLA_MPI_WORLD_BARRIER() @@ -157,7 +160,7 @@ int main(int argc, char** argv) } else if (timeStepStrategy == "kernelOnly") { timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL), "LBM StreamCollide"); } else if (timeStepStrategy == "StreamOnly") { - timeLoop.add() << Sweep(StreamOnlyKernel, "LBM Stream Only"); + timeLoop.add() << Sweep(sweepCollection.stream(SweepCollection_T::ALL), "LBM Stream-Only"); } else { WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'") } @@ -225,17 +228,24 @@ int main(int argc, char** argv) WALBERLA_ROOT_SECTION() { - python_coupling::PythonCallback pythonCallbackResults("results_callback"); - if (pythonCallbackResults.isCallable()) - { - pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); - pythonCallbackResults.data().exposeValue("stencil", infoStencil); - pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); - pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); - pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); - pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); - // Call Python function to report results - pythonCallbackResults(); + if(inputIsPython){ + python_coupling::PythonCallback pythonCallbackResults("results_callback"); + if (pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue("numProcesses", performance.processes()); + pythonCallbackResults.data().exposeValue("numThreads", performance.threads()); + pythonCallbackResults.data().exposeValue("numCores", performance.cores()); + pythonCallbackResults.data().exposeValue("mlups", performance.mlups(timesteps, time)); + pythonCallbackResults.data().exposeValue("mlupsPerCore", performance.mlupsPerCore(timesteps, time)); + pythonCallbackResults.data().exposeValue("mlupsPerProcess", performance.mlupsPerProcess(timesteps, time)); + pythonCallbackResults.data().exposeValue("stencil", infoStencil); + pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); + pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); + pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); + pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); + // Call Python function to report results + pythonCallbackResults(); + } } } } diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py index cd1a36114788a0ad440f89d750abc8af26109eda..ae9ec4f1bd6e26a8474099cd6f03d4c40f114854 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py @@ -7,7 +7,7 @@ from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases insert_symbol_times_minus_one from lbmpy.advanced_streaming import is_inplace -from lbmpy.advanced_streaming.utility import streaming_patterns +from lbmpy.advanced_streaming.utility import streaming_patterns, get_accessor, Timestep from lbmpy.boundaries import NoSlip, UBB from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule from lbmpy.enums import Method, Stencil @@ -26,40 +26,40 @@ options_dict = { 'srt': { 'method': Method.SRT, 'relaxation_rate': omega, - 'compressible': False, + 'compressible': True, }, 'trt': { 'method': Method.TRT, 'relaxation_rate': omega, - 'compressible': False, + 'compressible': True, }, - 'mrt': { + 'r-w-mrt': { 'method': Method.MRT, 'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1], - 'compressible': False, + 'compressible': True, }, - 'mrt-overrelax': { + 'w-mrt': { 'method': Method.MRT, 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], - 'compressible': False, + 'compressible': True, }, - 'central': { + 'r-cm': { 'method': Method.CENTRAL_MOMENT, 'relaxation_rate': omega, 'compressible': True, }, - 'central-overrelax': { + 'cm': { 'method': Method.CENTRAL_MOMENT, 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], 'compressible': True, }, - 'cumulant': { - 'method': Method.MONOMIAL_CUMULANT, + 'r-k': { + 'method': Method.CUMULANT, 'relaxation_rate': omega, 'compressible': True, }, - 'cumulant-overrelax': { - 'method': Method.MONOMIAL_CUMULANT, + 'k': { + 'method': Method.CUMULANT, 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)], 'compressible': True, }, @@ -87,9 +87,6 @@ const bool infoCseGlobal = {cse_global}; const bool infoCsePdfs = {cse_pdfs}; """ -# DEFAULTS -optimize = True - with CodeGeneration() as ctx: openmp = True if ctx.openmp else False field_type = "float64" if ctx.double_accuracy else "float32" @@ -105,9 +102,6 @@ with CodeGeneration() as ctx: streaming_pattern = config_tokens[1] collision_setup = config_tokens[2] - if len(config_tokens) >= 4: - optimize = (config_tokens[3] != 'noopt') - if stencil_str == "d3q27": stencil = LBStencil(Stencil.D3Q27) elif stencil_str == "d3q19": @@ -139,20 +133,15 @@ with CodeGeneration() as ctx: # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. # is_inplace is set to False to ensure that the streaming is done with src and dst field. # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. - accessor = CollideOnlyInplaceAccessor() - accessor.is_inplace = False - field_swaps_stream_only = [(pdfs, pdfs_tmp)] - stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor) + # accessor = CollideOnlyInplaceAccessor() + accessor = get_accessor(streaming_pattern, Timestep.EVEN) + #accessor.is_inplace = False + field_swaps_stream_only = () if accessor.is_inplace else [(pdfs, pdfs_tmp)] + stream_only_kernel = create_stream_only_kernel(stencil, pdfs, None if accessor.is_inplace else pdfs_tmp, accessor=accessor) # LB Sweep collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) - if optimize: - collision_rule = insert_constants(collision_rule) - collision_rule = insert_zeros(collision_rule) - collision_rule = insert_aliases(collision_rule) - collision_rule = insert_symbol_times_minus_one(collision_rule) - no_slip = lbm_boundary_generator(class_name='NoSlip', flag_uid='NoSlip', boundary_object=NoSlip()) ubb = lbm_boundary_generator(class_name='UBB', flag_uid='UBB', @@ -166,7 +155,8 @@ with CodeGeneration() as ctx: cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) # Stream only kernel - generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, + generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, + field_swaps=field_swaps_stream_only, target=ps.Target.CPU, cpu_openmp=openmp) infoHeaderParams = { diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py old mode 100755 new mode 100644 index 6a54df4522354c3f2f2ca51da370e7b4eba5229d..c3076cb3e1f457ecae1f6a5090adac2314c34e97 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -24,15 +24,15 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK ldc_setup = {'Border': [ {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'}, {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, ]} class Scenario: - def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), + def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), blocks_per_process=1, timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1), warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False, vtk_write_frequency=0, remaining_time_logger_frequency=-1): @@ -41,7 +41,8 @@ class Scenario: init_shear_flow = False periodic = (0, 0, 0) - self.blocks = block_decomposition(wlb.mpi.numProcesses()) + self.blocks_per_process = blocks_per_process + self.blocks = block_decomposition(self.blocks_per_process * wlb.mpi.numProcesses()) self.cells_per_block = cells_per_block self.periodic = periodic @@ -68,7 +69,7 @@ class Scenario: 'blocks': self.blocks, 'cellsPerBlock': self.cells_per_block, 'periodic': self.periodic, - 'oneBlockPerProcess': True + 'cartesianSetup': (self.blocks_per_process == 1) }, 'Parameters': { 'omega': self.omega, @@ -106,7 +107,7 @@ class Scenario: sequenceValuesToScalars(result) num_tries = 4 # check multiple times e.g. may fail when multiple benchmark processes are running - table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}" + table_name = f"runs" table_name = table_name.replace("-", "_") for num_try in range(num_tries): try: diff --git a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h index 29f7de7657e6d923b216d1ed4eae0229325a3762..215f5a5c1f5dae1c7b514026bd19def5bff8786f 100644 --- a/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h +++ b/src/lbm_generated/refinement/BasicRecursiveTimeStep.impl.h @@ -186,7 +186,7 @@ void BasicRecursiveTimeStep< PdfField_T, SweepCollection_T, BoundaryCollection_T for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); // Propagate on ghost layers shadowing coarse or no blocks - if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + if(block->neighborhoodSectionHasLargerBlock(nSecIdx)){ CellInterval ci; pdfField->getGhostRegion(*it, ci, 1); sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci);