diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 41519581d10865c5a1f6c405178ab79725ba7776..2adab475060437d4b2934f215c45d470b3b34b0a 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -22,7 +22,7 @@ if ( WALBERLA_BUILD_WITH_PYTHON ) if ( WALBERLA_BUILD_WITH_CODEGEN ) add_subdirectory( FlowAroundSphereCodeGen ) - add_subdirectory( UniformGridGenerated ) + add_subdirectory( UniformGridCPU ) add_subdirectory( PhaseFieldAllenCahn ) endif() diff --git a/apps/benchmarks/UniformGridCPU/CMakeLists.txt b/apps/benchmarks/UniformGridCPU/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2f06826e40553f9c157c5b5e5200ba8ed2b26b2 --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/CMakeLists.txt @@ -0,0 +1,45 @@ +waLBerla_link_files_to_builddir( "*.prm" ) +waLBerla_link_files_to_builddir( "*.py" ) +waLBerla_link_files_to_builddir( "simulation_setup" ) + + +foreach(streaming_pattern pull push aa esotwist) + foreach(stencil d3q19 d3q27) + foreach (collision_setup srt trt mrt mrt-overrelax central central-overrelax cumulant cumulant-overrelax entropic smagorinsky) + # KBC methods only for D2Q9 and D3Q27 defined + if (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") + continue() + endif (${collision_setup} STREQUAL "entropic" AND ${stencil} STREQUAL "d3q19") + + set(config ${stencil}_${streaming_pattern}_${collision_setup}) + waLBerla_generate_target_from_python(NAME UniformGridCPUGenerated_${config} + FILE UniformGridCPU.py + CODEGEN_CFG ${config} + OUT_FILES UniformGridCPU_LbKernel.cpp UniformGridCPU_LbKernel.h + UniformGridCPU_PackInfoEven.cpp UniformGridCPU_PackInfoEven.h + UniformGridCPU_PackInfoOdd.cpp UniformGridCPU_PackInfoOdd.h + UniformGridCPU_NoSlip.cpp UniformGridCPU_NoSlip.h + UniformGridCPU_UBB.cpp UniformGridCPU_UBB.h + UniformGridCPU_MacroSetter.cpp UniformGridCPU_MacroSetter.h + UniformGridCPU_MacroGetter.cpp UniformGridCPU_MacroGetter.h + UniformGridCPU_StreamOnlyKernel.cpp UniformGridCPU_StreamOnlyKernel.h + UniformGridCPU_InfoHeader.h + ) + + + waLBerla_add_executable(NAME UniformGridCPU_${config} + FILES UniformGridCPU.cpp + DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk UniformGridCPUGenerated_${config}) + + # all configs are excluded from all except for pull d3q27. + if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") + set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE) + set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL FALSE) + else() + set_target_properties( UniformGridCPUGenerated_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE) + set_target_properties( UniformGridCPU_${config} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif(${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") + + endforeach () + endforeach() +endforeach() diff --git a/apps/benchmarks/UniformGridGenerated/InitShearVelocity.h b/apps/benchmarks/UniformGridCPU/InitShearVelocity.h similarity index 100% rename from apps/benchmarks/UniformGridGenerated/InitShearVelocity.h rename to apps/benchmarks/UniformGridCPU/InitShearVelocity.h diff --git a/apps/benchmarks/UniformGridGenerated/ManualKernels.h b/apps/benchmarks/UniformGridCPU/ManualKernels.h similarity index 100% rename from apps/benchmarks/UniformGridGenerated/ManualKernels.h rename to apps/benchmarks/UniformGridCPU/ManualKernels.h diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c5fef7d891befd7e954c8a8e207e745ee8ef921c --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -0,0 +1,318 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file UniformGridCPU.cpp +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#include "blockforest/Initialization.h" +#include "blockforest/communication/UniformBufferedScheme.h" + +#include "core/Environment.h" +#include "core/OpenMP.h" +#include "core/logging/Initialization.h" +#include "core/timing/RemainingTimeLogger.h" +#include "core/timing/TimingPool.h" + +#include "domain_decomposition/SharedSweep.h" + +#include "field/AddToStorage.h" +#include "field/vtk/VTKWriter.h" + +#include "geometry/InitBoundaryHandling.h" + +#include "lbm/communication/CombinedInPlaceCpuPackInfo.h" + +#include "python_coupling/CreateConfig.h" +#include "python_coupling/DictWrapper.h" +#include "python_coupling/PythonCallback.h" + +#include "timeloop/all.h" + +#include <iomanip> + +#include "InitShearVelocity.h" +#include "ManualKernels.h" +#include "UniformGridCPU_InfoHeader.h" + +using namespace walberla; + +using PackInfoEven_T = lbm::UniformGridCPU_PackInfoEven; +using PackInfoOdd_T = lbm::UniformGridCPU_PackInfoOdd; +using LbSweep = lbm::UniformGridCPU_LbKernel; + +using FlagField_T = FlagField< uint8_t >; + +auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage* const storage) { + return new PdfField_T(storage->getNumberOfXCells(*block), storage->getNumberOfYCells(*block), + storage->getNumberOfZCells(*block), uint_t(1), field::fzyx, + make_shared< field::AllocateAligned< real_t, 64 > >()); +}; + +int main(int argc, char** argv) +{ + mpi::Environment env(argc, argv); + + for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) + { + WALBERLA_MPI_WORLD_BARRIER() + + auto config = *cfg; + logging::configureLogging(config); + auto blocks = blockforest::createUniformBlockGridFromConfig(config); + + Vector3< uint_t > cellsPerBlock = + config->getBlock("DomainSetup").getParameter< Vector3< uint_t > >("cellsPerBlock"); + // Reading parameters + auto parameters = config->getOneBlock("Parameters"); + const real_t omega = parameters.getParameter< real_t >("omega", real_c(1.4)); + const uint_t timesteps = parameters.getParameter< uint_t >("timesteps", uint_c(50)); + const bool initShearFlow = parameters.getParameter< bool >("initShearFlow", true); + + // Creating fields + BlockDataID pdfFieldId = blocks->addStructuredBlockData< PdfField_T >(pdfFieldAdder, "pdfs"); + BlockDataID velFieldId = field::addToStorage< VelocityField_T >(blocks, "vel", real_t(0), field::fzyx); + BlockDataID densityFieldId = field::addToStorage< ScalarField_T >(blocks, "density", real_t(1.0), field::fzyx); + + // Initialize velocity on cpu + if (initShearFlow) + { + WALBERLA_LOG_INFO_ON_ROOT("Initializing shear flow") + initShearVelocity(blocks, velFieldId); + } + + pystencils::UniformGridCPU_MacroSetter setterSweep(densityFieldId, pdfFieldId, velFieldId); + pystencils::UniformGridCPU_MacroGetter getterSweep(densityFieldId, pdfFieldId, velFieldId); + + // Set up initial PDF values + for (auto& block : *blocks) + setterSweep(&block); + + Vector3< int > innerOuterSplit = + parameters.getParameter< Vector3< int > >("innerOuterSplit", Vector3< int >(1, 1, 1)); + + for (uint_t i = 0; i < 3; ++i) + { + if (int_c(cellsPerBlock[i]) <= innerOuterSplit[i] * 2) + { + WALBERLA_ABORT_NO_DEBUG_INFO("innerOuterSplit too large - make it smaller or increase cellsPerBlock") + } + } + Cell innerOuterSplitCell(innerOuterSplit[0], innerOuterSplit[1], innerOuterSplit[2]); + + LbSweep lbSweep(pdfFieldId, omega, innerOuterSplitCell); + pystencils::UniformGridCPU_StreamOnlyKernel StreamOnlyKernel(pdfFieldId); + + // Boundaries + const FlagUID fluidFlagUID("Fluid"); + BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "Boundary Flag Field"); + auto boundariesConfig = config->getBlock("Boundaries"); + bool boundaries = false; + if (boundariesConfig) + { + WALBERLA_LOG_INFO_ON_ROOT("Setting boundary conditions") + boundaries = true; + geometry::initBoundaryHandling< FlagField_T >(*blocks, flagFieldID, boundariesConfig); + geometry::setNonBoundaryCellsToDomain< FlagField_T >(*blocks, flagFieldID, fluidFlagUID); + } + + lbm::UniformGridCPU_NoSlip noSlip(blocks, pdfFieldId); + noSlip.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("NoSlip"), fluidFlagUID); + + lbm::UniformGridCPU_UBB ubb(blocks, pdfFieldId); + ubb.fillFromFlagField< FlagField_T >(blocks, flagFieldID, FlagUID("UBB"), fluidFlagUID); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// COMMUNICATION SCHEME /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + // Initial setup is the post-collision state of an even time step + auto tracker = make_shared< lbm::TimestepTracker >(0); + auto packInfo = + make_shared< lbm::CombinedInPlaceCpuPackInfo< PackInfoEven_T , PackInfoOdd_T > >(tracker, pdfFieldId); + + blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); + communication.addPackInfo(packInfo); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME STEP DEFINITIONS /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + auto boundarySweep = [&](IBlock* block, uint8_t t) { + noSlip.run(block, t); + ubb.run(block, t); + }; + + auto boundaryInner = [&](IBlock* block, uint8_t t) { + noSlip.inner(block, t); + ubb.inner(block, t); + }; + + auto boundaryOuter = [&](IBlock* block, uint8_t t) { + noSlip.outer(block, t); + ubb.outer(block, t); + }; + + auto simpleOverlapTimeStep = [&]() { + // Communicate post-collision values of previous timestep... + communication.startCommunication(); + for (auto& block : *blocks) + { + if (boundaries) boundaryInner(&block, tracker->getCounter()); + lbSweep.inner(&block, tracker->getCounterPlusOne()); + } + communication.wait(); + for (auto& block : *blocks) + { + if (boundaries) boundaryOuter(&block, tracker->getCounter()); + lbSweep.outer(&block, tracker->getCounterPlusOne()); + } + + tracker->advance(); + }; + + auto normalTimeStep = [&]() { + communication.communicate(); + for (auto& block : *blocks) + { + if (boundaries) boundarySweep(&block, tracker->getCounter()); + lbSweep(&block, tracker->getCounterPlusOne()); + } + + tracker->advance(); + }; + + // With two-fields patterns, ghost layer cells act as constant stream-in boundaries; + // with in-place patterns, ghost layer cells act as wet-node no-slip boundaries. + auto kernelOnlyFunc = [&]() { + tracker->advance(); + for (auto& block : *blocks) + lbSweep(&block, tracker->getCounter()); + }; + + // Stream only function to test a streaming pattern without executing lbm operations inside + auto StreamOnlyFunc = [&]() { + for (auto& block : *blocks) + StreamOnlyKernel(&block); + }; + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// TIME LOOP SETUP /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + SweepTimeloop timeLoop(blocks->getBlockStorage(), timesteps); + + const std::string timeStepStrategy = parameters.getParameter< std::string >("timeStepStrategy", "normal"); + std::function< void() > timeStep; + if (timeStepStrategy == "noOverlap") + timeStep = std::function< void() >(normalTimeStep); + else if (timeStepStrategy == "simpleOverlap") + timeStep = simpleOverlapTimeStep; + else if (timeStepStrategy == "kernelOnly") + { + WALBERLA_LOG_INFO_ON_ROOT( + "Running only compute kernel without boundary - this makes only sense for benchmarking!") + // Run initial communication once to provide any missing stream-in populations + communication.communicate(); + timeStep = kernelOnlyFunc; + } + else if (timeStepStrategy == "StreamOnly") + { + WALBERLA_LOG_INFO_ON_ROOT( + "Running only streaming kernel without LBM - this makes only sense for benchmarking!") + // Run initial communication once to provide any missing stream-in populations + timeStep = StreamOnlyFunc; + } + else + { + WALBERLA_ABORT_NO_DEBUG_INFO("Invalid value for 'timeStepStrategy'. Allowed values are 'noOverlap', " + "'simpleOverlap', 'kernelOnly'") + } + + timeLoop.add() << BeforeFunction(timeStep) << Sweep([](IBlock*) {}, "time step"); + + uint_t vtkWriteFrequency = parameters.getParameter< uint_t >("vtkWriteFrequency", 0); + if (vtkWriteFrequency > 0) + { + auto vtkOutput = vtk::createVTKOutput_BlockData(*blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", + "simulation_step", false, true, true, false, 0); + auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >(velFieldId, "vel"); + vtkOutput->addCellDataWriter(velWriter); + + vtkOutput->addBeforeFunction([&]() { + for (auto& block : *blocks){ + getterSweep(&block);} + }); + + timeLoop.addFuncBeforeTimeStep(vtk::writeFiles(vtkOutput), "VTK Output"); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /// BENCHMARK /// + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + int warmupSteps = parameters.getParameter< int >("warmupSteps", 2); + int outerIterations = parameters.getParameter< int >("outerIterations", 1); + for (int i = 0; i < warmupSteps; ++i) + timeLoop.singleStep(); + + real_t remainingTimeLoggerFrequency = + parameters.getParameter< real_t >("remainingTimeLoggerFrequency", -1.0); // in seconds + if (remainingTimeLoggerFrequency > 0) + { + auto logger = timing::RemainingTimeLogger(timeLoop.getNrOfTimeSteps() * uint_c(outerIterations), + remainingTimeLoggerFrequency); + timeLoop.addFuncAfterTimeStep(logger, "remaining time logger"); + } + + for (int outerIteration = 0; outerIteration < outerIterations; ++outerIteration) + { + timeLoop.setCurrentTimeStepToZero(); + WcTimer simTimer; + WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") + simTimer.start(); + timeLoop.run(); + simTimer.end(); + WALBERLA_LOG_INFO_ON_ROOT("Simulation finished") + real_t time = simTimer.last(); + WALBERLA_MPI_SECTION() + { + walberla::mpi::reduceInplace(time, walberla::mpi::MAX); + } + auto nrOfCells = real_c(cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2]); + + auto mlupsPerProcess = nrOfCells * real_c(timesteps) / time * 1e-6; + WALBERLA_LOG_RESULT_ON_ROOT("MLUPS per process " << mlupsPerProcess) + WALBERLA_LOG_RESULT_ON_ROOT("Time per time step " << time / real_c(timesteps)) + WALBERLA_ROOT_SECTION() + { + python_coupling::PythonCallback pythonCallbackResults("results_callback"); + if (pythonCallbackResults.isCallable()) + { + pythonCallbackResults.data().exposeValue("mlupsPerProcess", mlupsPerProcess); + pythonCallbackResults.data().exposeValue("stencil", infoStencil); + pythonCallbackResults.data().exposeValue("streamingPattern", infoStreamingPattern); + pythonCallbackResults.data().exposeValue("collisionSetup", infoCollisionSetup); + pythonCallbackResults.data().exposeValue("cse_global", infoCseGlobal); + pythonCallbackResults.data().exposeValue("cse_pdfs", infoCsePdfs); + // Call Python function to report results + pythonCallbackResults(); + } + } + } + } + return EXIT_SUCCESS; +} diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.py b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea96523a8bf24582eec39479ee2b45857197490 --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.py @@ -0,0 +1,215 @@ +from dataclasses import replace + +import sympy as sp +import pystencils as ps + +from pystencils.fast_approximation import insert_fast_divisions, insert_fast_sqrts +from pystencils.simp.subexpression_insertion import insert_zeros, insert_aliases, insert_constants,\ + insert_symbol_times_minus_one + +from lbmpy.advanced_streaming import Timestep, is_inplace +from lbmpy.advanced_streaming.utility import streaming_patterns +from lbmpy.boundaries import NoSlip, UBB +from lbmpy.creationfunctions import LBMConfig, LBMOptimisation, LBStencil, create_lb_collision_rule +from lbmpy.enums import Method, Stencil +from lbmpy.fieldaccess import CollideOnlyInplaceAccessor +from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter +from lbmpy.updatekernels import create_stream_only_kernel + +from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\ + generate_mpidtype_info_from_kernel, generate_info_header + +from lbmpy_walberla import generate_alternating_lbm_sweep, generate_alternating_lbm_boundary, generate_lb_pack_info + +omega = sp.symbols('omega') +omega_free = sp.Symbol('omega_free') + +# best configs in terms of FLOPS +options_dict = { + 'srt': { + 'method': Method.SRT, + 'relaxation_rate': omega, + 'compressible': False, + }, + 'trt': { + 'method': Method.TRT, + 'relaxation_rate': omega, + 'compressible': False, + }, + 'mrt': { + 'method': Method.MRT, + 'relaxation_rates': [omega, 1, 1, 1, 1, 1, 1], + 'compressible': False, + }, + 'mrt-overrelax': { + 'method': Method.MRT, + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], + 'compressible': False, + }, + 'central': { + 'method': Method.CENTRAL_MOMENT, + 'relaxation_rate': omega, + 'compressible': True, + }, + 'central-overrelax': { + 'method': Method.CENTRAL_MOMENT, + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 11)], + 'compressible': True, + }, + 'cumulant': { + 'method': Method.MONOMIAL_CUMULANT, + 'relaxation_rate': omega, + 'compressible': True, + }, + 'cumulant-overrelax': { + 'method': Method.MONOMIAL_CUMULANT, + 'relaxation_rates': [omega] + [1 + x * 1e-2 for x in range(1, 18)], + 'compressible': True, + }, + 'entropic': { + 'method': Method.TRT_KBC_N4, + 'compressible': True, + 'relaxation_rates': [omega, omega_free], + 'entropic': True, + 'entropic_newton_iterations': False + }, + 'smagorinsky': { + 'method': Method.SRT, + 'smagorinsky': False, + 'relaxation_rate': omega, + } +} + + +info_header = """ +const char * infoStencil = "{stencil}"; +const char * infoStreamingPattern = "{streaming_pattern}"; +const char * infoCollisionSetup = "{collision_setup}"; +const bool infoCseGlobal = {cse_global}; +const bool infoCsePdfs = {cse_pdfs}; +""" + +# DEFAULTS +optimize = True + +with CodeGeneration() as ctx: + openmp = True if ctx.openmp else False + field_type = "float64" if ctx.double_accuracy else "float32" + if ctx.optimize_for_localhost: + cpu_vec = {"nontemporal": True, "assume_aligned": True} + else: + cpu_vec = None + + config_tokens = ctx.config.split('_') + + assert len(config_tokens) >= 3 + stencil_str = config_tokens[0] + streaming_pattern = config_tokens[1] + collision_setup = config_tokens[2] + + if len(config_tokens) >= 4: + optimize = (config_tokens[3] != 'noopt') + + if stencil_str == "d3q27": + stencil = LBStencil(Stencil.D3Q27) + elif stencil_str == "d3q19": + stencil = LBStencil(Stencil.D3Q19) + else: + raise ValueError("Only D3Q27 and D3Q19 stencil are supported at the moment") + + assert streaming_pattern in streaming_patterns, f"Invalid streaming pattern: {streaming_pattern}" + + options = options_dict[collision_setup] + + q = stencil.Q + dim = stencil.D + assert dim == 3, "This app supports only three-dimensional stencils" + pdfs, pdfs_tmp = ps.fields(f"pdfs({q}), pdfs_tmp({q}): {field_type}[3D]", layout='fzyx') + density_field, velocity_field = ps.fields(f"density, velocity(3) : {field_type}[3D]", layout='fzyx') + + lbm_config = LBMConfig(stencil=stencil, field_name=pdfs.name, streaming_pattern=streaming_pattern, **options) + lbm_opt = LBMOptimisation(cse_global=True, cse_pdfs=False, symbolic_field=pdfs, field_layout='fzyx') + + if not is_inplace(streaming_pattern): + lbm_opt = replace(lbm_opt, symbolic_temporary_field=pdfs_tmp) + field_swaps = [(pdfs, pdfs_tmp)] + else: + field_swaps = [] + + # Sweep for Stream only. This is for benchmarking an empty streaming pattern without LBM. + # is_inplace is set to False to ensure that the streaming is done with src and dst field. + # If this is not the case the compiler might simplify the streaming in a way that benchmarking makes no sense. + accessor = CollideOnlyInplaceAccessor() + accessor.is_inplace = False + field_swaps_stream_only = [(pdfs, pdfs_tmp)] + stream_only_kernel = create_stream_only_kernel(stencil, pdfs, pdfs_tmp, accessor=accessor) + + # LB Sweep + collision_rule = create_lb_collision_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt) + + if optimize: + collision_rule = insert_fast_divisions(collision_rule) + collision_rule = insert_fast_sqrts(collision_rule) + collision_rule = insert_constants(collision_rule) + collision_rule = insert_zeros(collision_rule) + collision_rule = insert_aliases(collision_rule) + collision_rule = insert_symbol_times_minus_one(collision_rule) + + lb_method = collision_rule.method + + generate_alternating_lbm_sweep(ctx, 'UniformGridCPU_LbKernel', collision_rule, lbm_config=lbm_config, + lbm_optimisation=lbm_opt, target=ps.Target.CPU, + inner_outer_split=True, field_swaps=field_swaps, + cpu_openmp=openmp, cpu_vectorize_info=cpu_vec) + + # getter & setter + setter_assignments = macroscopic_values_setter(lb_method, + density=density_field.center, velocity=velocity_field.center_vector, + pdfs=pdfs, + streaming_pattern=streaming_pattern, + previous_timestep=Timestep.EVEN) + getter_assignments = macroscopic_values_getter(lb_method, + density=density_field, velocity=velocity_field, + pdfs=pdfs, + streaming_pattern=streaming_pattern, + previous_timestep=Timestep.EVEN) + + generate_sweep(ctx, 'UniformGridCPU_MacroSetter', setter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) + generate_sweep(ctx, 'UniformGridCPU_MacroGetter', getter_assignments, target=ps.Target.CPU, cpu_openmp=openmp) + + # Stream only kernel + generate_sweep(ctx, 'UniformGridCPU_StreamOnlyKernel', stream_only_kernel, field_swaps=field_swaps_stream_only, + target=ps.Target.CPU, cpu_openmp=openmp) + + # Boundaries + noslip = NoSlip() + ubb = UBB((0.05, 0, 0), data_type=field_type) + + generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_NoSlip', noslip, lb_method, field_name=pdfs.name, + streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) + generate_alternating_lbm_boundary(ctx, 'UniformGridCPU_UBB', ubb, lb_method, field_name=pdfs.name, + streaming_pattern=streaming_pattern, target=ps.Target.CPU, cpu_openmp=openmp) + + # communication + generate_lb_pack_info(ctx, 'UniformGridCPU_PackInfo', stencil, pdfs, + streaming_pattern=streaming_pattern, target=ps.Target.CPU, + always_generate_separate_classes=True) + + infoHeaderParams = { + 'stencil': stencil_str, + 'streaming_pattern': streaming_pattern, + 'collision_setup': collision_setup, + 'cse_global': int(lbm_opt.cse_global), + 'cse_pdfs': int(lbm_opt.cse_pdfs), + } + + stencil_typedefs = {'Stencil_T': stencil, + 'CommunicationStencil_T': stencil} + field_typedefs = {'PdfField_T': pdfs, + 'VelocityField_T': velocity_field, + 'ScalarField_T': density_field} + + # Info header containing correct template definitions for stencil and field + generate_info_header(ctx, 'UniformGridCPU_InfoHeader', + stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, + additional_code=info_header.format(**infoHeaderParams)) diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py new file mode 100755 index 0000000000000000000000000000000000000000..f432e778bc8e7d5c82120db40469ed7d2f2aa7ed --- /dev/null +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -0,0 +1,210 @@ +import os +import waLBerla as wlb +from waLBerla.tools.config import block_decomposition +from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle +import sys +import sqlite3 +from math import prod + +# Number of time steps run for a workload of 128^3 per process +# if double as many cells are on the process, half as many time steps are run etc. +# increase this to get more reliable measurements +TIME_STEPS_FOR_128_BLOCK = 5 +DB_FILE = os.environ.get('DB_FILE', "cpu_benchmark.sqlite3") + + +def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK): + cells = block_size[0] * block_size[1] * block_size[2] + time_steps = (128 ** 3 / cells) * time_steps_for_128_block + return int(time_steps) + + +ldc_setup = {'Border': [ + {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, +]} + + +class Scenario: + def __init__(self, cells_per_block=(128, 128, 128), periodic=(1, 1, 1), + timesteps=None, time_step_strategy="normal", omega=1.8, inner_outer_split=(1, 1, 1), + warmup_steps=2, outer_iterations=3, init_shear_flow=False, boundary_setup=False, + vtk_write_frequency=0, remaining_time_logger_frequency=-1): + + if boundary_setup: + init_shear_flow = False + periodic = (0, 0, 0) + + self.blocks = block_decomposition(wlb.mpi.numProcesses()) + + self.cells_per_block = cells_per_block + self.periodic = periodic + + self.time_step_strategy = time_step_strategy + self.omega = omega + self.timesteps = timesteps if timesteps else num_time_steps(cells_per_block) + self.inner_outer_split = inner_outer_split + self.init_shear_flow = init_shear_flow + self.boundary_setup = boundary_setup + self.warmup_steps = warmup_steps + self.outer_iterations = outer_iterations + + self.vtk_write_frequency = vtk_write_frequency + self.remaining_time_logger_frequency = remaining_time_logger_frequency + + self.config_dict = self.config(print_dict=False) + + @wlb.member_callback + def config(self, print_dict=True): + from pprint import pformat + config_dict = { + 'DomainSetup': { + 'blocks': self.blocks, + 'cellsPerBlock': self.cells_per_block, + 'periodic': self.periodic, + }, + 'Parameters': { + 'omega': self.omega, + 'warmupSteps': self.warmup_steps, + 'outerIterations': self.outer_iterations, + 'timeStepStrategy': self.time_step_strategy, + 'timesteps': self.timesteps, + 'initShearFlow': self.init_shear_flow, + 'innerOuterSplit': self.inner_outer_split, + 'vtkWriteFrequency': self.vtk_write_frequency, + 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency + } + } + if self.boundary_setup: + config_dict["Boundaries"] = ldc_setup + + if print_dict: + wlb.log_info_on_root("Scenario:\n" + pformat(config_dict)) + return config_dict + + @wlb.member_callback + def results_callback(self, **kwargs): + data = {} + data.update(self.config_dict['Parameters']) + data.update(self.config_dict['DomainSetup']) + data.update(kwargs) + + data['executable'] = sys.argv[0] + data['compile_flags'] = wlb.build_info.compiler_flags + data['walberla_version'] = wlb.build_info.version + data['build_machine'] = wlb.build_info.build_machine + sequenceValuesToScalars(data) + + result = data + sequenceValuesToScalars(result) + num_tries = 4 + # check multiple times e.g. may fail when multiple benchmark processes are running + table_name = f"runs_{data['stencil']}_{data['streamingPattern']}_{data['collisionSetup']}_{prod(self.blocks)}" + table_name = table_name.replace("-", "_") + for num_try in range(num_tries): + try: + checkAndUpdateSchema(result, table_name, DB_FILE) + storeSingle(result, table_name, DB_FILE) + break + except sqlite3.OperationalError as e: + wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/{num_tries} {str(e)}") + + +# -------------------------------------- Profiling ----------------------------------- +def profiling(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running 2 timesteps for profiling") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + cells = (128, 128, 128) + + scenarios.add(Scenario(cells_per_block=cells, time_step_strategy='kernelOnly', + inner_outer_split=(1, 1, 1), timesteps=2, + outer_iterations=1, warmup_steps=0)) + + +# -------------------------------------- Functions trying different parameter sets ----------------------------------- + + +def overlap_benchmark(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running different communication overlap strategies") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + inner_outer_splits = [(1, 1, 1), (4, 1, 1)] + cells_per_block = [(i, i, i) for i in (16, 32, 64, 128)] + + for cell_per_block in cells_per_block: + scenarios.add(Scenario(time_step_strategy='noOverlap', + inner_outer_split=(1, 1, 1), + cells_per_block=cell_per_block)) + + for inner_outer_split in inner_outer_splits: + scenario = Scenario(time_step_strategy='simpleOverlap', + inner_outer_split=inner_outer_split, + cells_per_block=cell_per_block) + scenarios.add(scenario) + + +def scaling_benchmark(): + """Tests different communication overlapping strategies""" + wlb.log_info_on_root("Running scaling benchmark") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + cells_per_block = [(32, 32, 32), (128, 128, 128)] + + for cell_per_block in cells_per_block: + scenarios.add(Scenario(time_step_strategy='noOverlap', + inner_outer_split=(1, 1, 1), + cells_per_block=cell_per_block)) + + +def single_node_benchmark(): + """Benchmarks only the LBM compute kernel""" + wlb.log_info_on_root("Running single Node benchmarks") + wlb.log_info_on_root("") + + scenarios = wlb.ScenarioManager() + block_sizes = [(i, i, i) for i in (8, 16, 32, 64, 128)] + for block_size in block_sizes: + scenario = Scenario(cells_per_block=block_size, + time_step_strategy='kernelOnly', + timesteps=num_time_steps(block_size)) + scenarios.add(scenario) + + +def validation_run(): + """Run with full periodic shear flow or boundary scenario (ldc) to check if the code works""" + wlb.log_info_on_root("Validation run") + wlb.log_info_on_root("") + + time_step_strategy = 'simpleOverlap' # 'noOverlap' + + scenarios = wlb.ScenarioManager() + scenario = Scenario(cells_per_block=(64, 64, 64), + time_step_strategy=time_step_strategy, + timesteps=101, + outer_iterations=1, + warmup_steps=0, + init_shear_flow=True, + boundary_setup=False, + vtk_write_frequency=100, + remaining_time_logger_frequency=10) + scenarios.add(scenario) + + +wlb.log_info_on_root(f"Batch run of benchmark scenarios, saving result to {DB_FILE}") +# Select the benchmark you want to run +single_node_benchmark() # benchmarks different CUDA block sizes and domain sizes and measures single GPU +# performance of compute kernel (no communication) +# overlap_benchmark() # benchmarks different communication overlap options +# profiling() # run only two timesteps on a smaller domain for profiling only +# validation_run() +# scaling_benchmark() diff --git a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt b/apps/benchmarks/UniformGridGenerated/CMakeLists.txt deleted file mode 100644 index dda05f8fdda22c560543253a3a8f97e21781afc3..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGenerated/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -waLBerla_link_files_to_builddir( "*.prm" ) -waLBerla_link_files_to_builddir( "*.py" ) - - -foreach(config srt trt smagorinsky mrt entropic_kbc_n4 cumulant ) - waLBerla_generate_target_from_python(NAME UniformGridGenerated_${config} - CODEGEN_CFG ${config} - FILE UniformGridGenerated.py - OUT_FILES GenMacroGetter.cpp GenMacroGetter.h GenMacroSetter.cpp GenMacroSetter.h - GenPackInfo.cpp GenPackInfo.h GenPackInfoAAPush.cpp GenPackInfoAAPush.h GenPackInfoAAPull.cpp GenPackInfoAAPull.h - GenLbKernel.cpp GenLbKernel.h GenLbKernelAAEven.cpp GenLbKernelAAEven.h GenLbKernelAAOdd.cpp GenLbKernelAAOdd.h - GenMpiDtypeInfo.h GenMpiDtypeInfoAAPull.h GenMpiDtypeInfoAAPush.h - GenDefines.h) - - waLBerla_add_executable ( NAME UniformGridBenchmarkGenerated_${config} - FILES UniformGridGenerated.cpp - DEPENDS blockforest boundary core domain_decomposition field geometry timeloop vtk gui - UniformGridGenerated_${config} python_coupling) - -endforeach() diff --git a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm b/apps/benchmarks/UniformGridGenerated/UniformGrid.prm deleted file mode 100644 index 702d06252c13908fe98f99757acf4d4a4e77f799..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGenerated/UniformGrid.prm +++ /dev/null @@ -1,24 +0,0 @@ -DomainSetup -{ - blocks < 1, 1, 1 >; - cellsPerBlock < 64, 64, 64 >; - periodic < 1, 1, 1 >; -} - -Parameters -{ - - timesteps 200; // time steps of one performance measurement - warmupSteps 1; // number of steps to run before measurement starts - outerIterations 4; // how many measurements to conduct - - vtkWriteFrequency 0; // write a VTK file every n'th step, if zero VTK output is disabled - timeStepMode twoField; - //twoFieldKernelType manualD3Q19; - remainingTimeLoggerFrequency 6; // interval in seconds to log the estimated remaining time - directComm 0; - - omega 1.8; - shearVelocityMagnitude 0.02; - useGui 0; -} diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp deleted file mode 100644 index e27ee3e61d6b2bbf6e57d9ca7353d190978f6c22..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "core/Environment.h" -#include "core/logging/Initialization.h" -#include "core/OpenMP.h" -#include "python_coupling/CreateConfig.h" -#include "python_coupling/PythonCallback.h" -#include "python_coupling/DictWrapper.h" -#include "blockforest/Initialization.h" -#include "field/vtk/VTKWriter.h" -#include "field/AddToStorage.h" -#include "blockforest/communication/UniformBufferedScheme.h" -#include "blockforest/communication/UniformDirectScheme.h" -#include "timeloop/all.h" -#include "core/timing/TimingPool.h" -#include "core/timing/RemainingTimeLogger.h" -#include "core/waLBerlaBuildInfo.h" -#include "domain_decomposition/SharedSweep.h" -#include "gui/Gui.h" -#include "InitShearVelocity.h" -#include "ManualKernels.h" -#include "lbm/lattice_model/D3Q19.h" - -#include "GenDefines.h" - -#include <iomanip> - -using namespace walberla; - -int main( int argc, char **argv ) -{ - mpi::Environment env( argc, argv ); - - for( auto cfg = python_coupling::configBegin( argc, argv ); cfg != python_coupling::configEnd(); ++cfg ) - { - WALBERLA_MPI_WORLD_BARRIER() - - auto config = *cfg; - logging::configureLogging( config ); - auto blocks = blockforest::createUniformBlockGridFromConfig( config ); - - Vector3<uint_t> cellsPerBlock = config->getBlock( "DomainSetup" ).getParameter<Vector3<uint_t> >( "cellsPerBlock" ); - // Reading parameters - auto parameters = config->getOneBlock( "Parameters" ); - const std::string timeStepMode = parameters.getParameter<std::string>( "timeStepMode", "twoField"); - const real_t omega = parameters.getParameter<real_t>( "omega", real_c( 1.4 )); - uint_t timesteps = parameters.getParameter<uint_t>( "timesteps", uint_c( 60 )); - const real_t shearVelocityMagnitude = parameters.getParameter<real_t>("shearVelocityMagnitude", real_t(0.08)); - const bool directComm = parameters.getParameter<bool>("directComm", false); - - auto pdfFieldAdder = [](IBlock* const block, StructuredBlockStorage * const storage) { - return new PdfField_T(storage->getNumberOfXCells(*block), - storage->getNumberOfYCells(*block), - storage->getNumberOfZCells(*block), - uint_t(1), - field::fzyx, - make_shared<field::AllocateAligned<real_t, 64>>()); - }; - - // Creating fields - BlockDataID pdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs"); - BlockDataID velFieldId = field::addToStorage< VelocityField_T >( blocks, "vel", real_t( 0 ), field::fzyx ); - - pystencils::GenMacroSetter setterKernel(pdfFieldId, velFieldId); - pystencils::GenMacroGetter getterKernel(pdfFieldId, velFieldId); - - if( shearVelocityMagnitude > 0 ) - initShearVelocity(blocks, velFieldId, shearVelocityMagnitude); - for( auto & b : *blocks) - setterKernel(&b); - - // Buffered Comm - blockforest::communication::UniformBufferedScheme< Stencil_T > twoFieldComm(blocks ); - twoFieldComm.addPackInfo(make_shared< pystencils::GenPackInfo >(pdfFieldId ) ); - - blockforest::communication::UniformBufferedScheme< Stencil_T > aaPullComm(blocks); - aaPullComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPull>(pdfFieldId)); - - blockforest::communication::UniformBufferedScheme< Stencil_T > aaPushComm(blocks); - aaPushComm.addPackInfo(make_shared< pystencils::GenPackInfoAAPush>(pdfFieldId)); - - // Direct Comm - blockforest::communication::UniformDirectScheme< Stencil_T > twoFieldCommDirect(blocks); - twoFieldCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfo>(pdfFieldId)); - - blockforest::communication::UniformDirectScheme< Stencil_T > aaPullCommDirect(blocks); - aaPullCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPull>(pdfFieldId)); - - blockforest::communication::UniformDirectScheme< Stencil_T > aaPushCommDirect(blocks); - aaPushCommDirect.addDataToCommunicate(make_shared<pystencils::GenMpiDtypeInfoAAPush>(pdfFieldId)); - - - const std::string twoFieldKernelType = parameters.getParameter<std::string>( "twoFieldKernelType", "generated"); - std::function<void(IBlock*)> twoFieldKernel; - if( twoFieldKernelType == "generated") { - twoFieldKernel = pystencils::GenLbKernel(pdfFieldId, omega); - } else if (twoFieldKernelType == "manualGeneric") { - using MyLM = lbm::D3Q19<lbm::collision_model::SRT>; - BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs"); - twoFieldKernel = StreamPullCollideGeneric<MyLM>(pdfFieldId, tmpPdfFieldId, omega); - } else if (twoFieldKernelType == "manualD3Q19") { - using MyLM = lbm::D3Q19<lbm::collision_model::SRT>; - BlockDataID tmpPdfFieldId = blocks->addStructuredBlockData<PdfField_T>(pdfFieldAdder, "pdfs"); - twoFieldKernel = StreamPullCollideD3Q19<MyLM>(pdfFieldId, tmpPdfFieldId, omega); - } else { - WALBERLA_ABORT_NO_DEBUG_INFO("Invalid option for \"twoFieldKernelType\", " - "valid options are \"generated\", \"manualGeneric\", \"manualD3Q19\"") - } - - using F = std::function<void()>; - SweepTimeloop timeLoop( blocks->getBlockStorage(), timesteps / 2 ); - if( timeStepMode == "twoField") - { - timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" ) - << Sweep( twoFieldKernel, "LB stream & collide1" ); - timeLoop.add() << BeforeFunction(directComm ? F(twoFieldCommDirect) : F(twoFieldComm), "communication" ) - << Sweep( twoFieldKernel, "LB stream & collide2" ); - - } else if ( timeStepMode == "twoFieldKernelOnly") { - timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide1" ); - timeLoop.add() << Sweep( pystencils::GenLbKernel(pdfFieldId, omega), "LB stream & collide2" ); - } else if ( timeStepMode == "aa") { - timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" ); - timeLoop.add() << BeforeFunction( directComm ? F(aaPullCommDirect) : F(aaPullComm) ) - << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd") - << AfterFunction( directComm ? F(aaPushCommDirect) : F(aaPushComm) ); - } else if ( timeStepMode == "aaKernelOnly") { - timeLoop.add() << Sweep( pystencils::GenLbKernelAAEven(pdfFieldId, omega), "AA Even" ); - timeLoop.add() << Sweep( pystencils::GenLbKernelAAOdd(pdfFieldId, omega), "AA Odd"); - } else { - WALBERLA_ABORT("Invalid value for timeStepMode") - } - - - int warmupSteps = parameters.getParameter<int>( "warmupSteps", 2 ); - int outerIterations = parameters.getParameter<int>( "outerIterations", 1 ); - for(int i=0; i < warmupSteps; ++i ) - timeLoop.singleStep(); - - auto remainingTimeLoggerFrequency = parameters.getParameter< double >( "remainingTimeLoggerFrequency", -1.0 ); // in seconds - if (remainingTimeLoggerFrequency > 0) { - auto logger = timing::RemainingTimeLogger( timeLoop.getNrOfTimeSteps() * uint_c( outerIterations ), remainingTimeLoggerFrequency ); - timeLoop.addFuncAfterTimeStep( logger, "remaining time logger" ); - } - - // VTK - uint_t vtkWriteFrequency = parameters.getParameter<uint_t>( "vtkWriteFrequency", 0 ); - if( vtkWriteFrequency > 0 ) - { - auto vtkOutput = vtk::createVTKOutput_BlockData( *blocks, "vtk", vtkWriteFrequency, 0, false, "vtk_out", - "simulation_step", false, true, true, false, 0 ); - auto velWriter = make_shared< field::VTKWriter< VelocityField_T > >( velFieldId, "vel" ); - vtkOutput->addCellDataWriter( velWriter ); - vtkOutput->addBeforeFunction( [&]() - { for( auto & b : *blocks) - getterKernel(&b); - } ); - timeLoop.addFuncAfterTimeStep( vtk::writeFiles( vtkOutput ), "VTK Output" ); - } - - - bool useGui = parameters.getParameter<bool>( "useGui", false ); - if( useGui ) - { - GUI gui( timeLoop, blocks, argc, argv); - gui.run(); - } - else - { - for ( int outerIteration = 0; outerIteration < outerIterations; ++outerIteration ) - { - timeLoop.setCurrentTimeStepToZero(); - WcTimer simTimer; - - auto threads = omp_get_max_threads(); - - simTimer.start(); - timeLoop.run(); - simTimer.end(); - auto time = simTimer.last(); - auto nrOfCells = real_c( cellsPerBlock[0] * cellsPerBlock[1] * cellsPerBlock[2] ); - auto mlupsPerProcess = nrOfCells * real_c( timesteps ) / time * 1e-6; - - using std::setw; - WALBERLA_LOG_INFO_ON_ROOT(setw(18) << timeStepMode << - " procs: " << setw(6) << MPIManager::instance()->numProcesses() << - " threads: " << threads << - " direct_comm: " << directComm << - " time steps: " << timesteps << - setw(15) << " block size: " << cellsPerBlock << - " mlups/core: " << int(mlupsPerProcess/ threads) << - " mlups: " << int(mlupsPerProcess) * MPIManager::instance()->numProcesses()) - - WALBERLA_ROOT_SECTION() - { - python_coupling::PythonCallback pythonCallbackResults( "results_callback" ); - if ( pythonCallbackResults.isCallable()) - { - pythonCallbackResults.data().exposeValue( "mlupsPerProcess", mlupsPerProcess ); - pythonCallbackResults.data().exposeValue( "stencil", infoStencil ); - pythonCallbackResults.data().exposeValue( "configName", infoConfigName ); - pythonCallbackResults.data().exposeValue( "timeStepMode", timeStepMode ); - pythonCallbackResults.data().exposeValue( "twoFieldKernel", twoFieldKernelType ); - pythonCallbackResults.data().exposeValue( "optimizations", optimizationDict ); - pythonCallbackResults.data().exposeValue( "githash", core::buildinfo::gitSHA1() ); - pythonCallbackResults.data().exposeValue( "compilerFlags", core::buildinfo::compilerFlags() ); - pythonCallbackResults.data().exposeValue( "buildMachine", core::buildinfo::buildMachine() ); - - // Call Python function to report results - pythonCallbackResults(); - } - } - } - } - } - - return 0; -} diff --git a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py b/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py deleted file mode 100644 index 01b00e4a53fb6c3a8d463c9b15e86f9a34eb25fe..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGenerated/UniformGridGenerated.py +++ /dev/null @@ -1,208 +0,0 @@ -import sympy as sp -import pystencils as ps -from lbmpy.creationfunctions import create_lb_update_rule, create_lb_collision_rule -from pystencils_walberla import CodeGeneration, generate_pack_info_from_kernel, generate_sweep,\ - generate_mpidtype_info_from_kernel, generate_info_header - -from lbmpy import LBMConfig, LBMOptimisation, LBStencil, Method, Stencil -from lbmpy.macroscopic_value_kernels import macroscopic_values_getter, macroscopic_values_setter -from lbmpy.fieldaccess import AAEvenTimeStepAccessor, AAOddTimeStepAccessor - -omega = sp.symbols('omega') -omega_free = sp.Symbol('omega_free') -omega_fill = sp.symbols('omega_:10') - -options_dict = { - 'srt': { - 'method': 'srt', - 'stencil': 'D3Q19', - 'relaxation_rate': omega, - 'compressible': False, - }, - 'trt': { - 'method': 'trt', - 'stencil': 'D3Q19', - 'compressible': False, - 'relaxation_rate': omega, - }, - 'mrt': { - 'method': 'mrt', - 'stencil': 'D3Q19', - 'relaxation_rates': [0.0, omega, 1.3, 1.4, omega, 1.2, 1.1, 1.15, 1.234, 1.4235, 1.242, 1.2567, 0.9, 0.7], - }, - 'mrt_full': { - 'method': 'mrt', - 'stencil': 'D3Q19', - 'relaxation_rates': [omega_fill[0], omega, omega_fill[1], omega_fill[2], - omega_fill[3], omega_fill[4], omega_fill[5]], - }, - 'entropic': { - 'method': 'mrt', - 'stencil': 'D3Q19', - 'compressible': True, - 'relaxation_rates': [omega, omega, omega_free, omega_free, omega_free], - 'entropic': True, - }, - 'entropic_kbc_n4': { - 'method': 'trt_kbc_n4', - 'stencil': 'D3Q27', - 'compressible': True, - 'relaxation_rates': [omega, omega_free], - 'entropic': True, - }, - 'smagorinsky': { - 'method': 'srt', - 'stencil': 'D3Q19', - 'smagorinsky': True, - 'relaxation_rate': omega, - }, - 'cumulant': { - 'method': 'cumulant', - 'stencil': 'D3Q19', - 'compressible': True, - 'relaxation_rate': omega, - }, -} - -with CodeGeneration() as ctx: - common_options = { - 'field_name': 'pdfs', - 'temporary_field_name': 'pdfs_tmp', - } - opts = { - 'two_field_cse_pdfs': False, - 'two_field_cse_global': False, - 'two_field_split': True, - 'two_field_nt_stores': True, - - 'aa_even_cse_pdfs': False, - 'aa_even_cse_global': False, - 'aa_even_split': False, - 'aa_even_nt_stores': False, - - 'aa_odd_cse_pdfs': False, - 'aa_odd_cse_global': False, - 'aa_odd_split': True, - 'aa_odd_nt_stores': False, - - 'compiled_in_boundaries': False, - } - config_name = ctx.config - noopt = False - d3q27 = False - if config_name.endswith('_d3q27'): - d3q27 = True - config_name = config_name[:-len('_d3q27')] - - if config_name == '': - config_name = 'trt' - options = options_dict[config_name] - options.update(common_options) - options = options.copy() - - if d3q27: - stencil = LBStencil(Stencil.D3Q27) - options['stencil'] = stencil - else: - stencil = LBStencil(options['stencil']) - - dtype_string = 'float64' if ctx.double_accuracy else 'float32' - - pdfs, velocity_field = ps.fields(f'pdfs({stencil.Q}), velocity(3) : {dtype_string}[3D]', layout='fzyx') - - lbm_config = LBMConfig(**options) - lbm_optimisation = LBMOptimisation(symbolic_field=pdfs, split=opts['two_field_split'], - cse_global=opts['two_field_cse_global'], cse_pdfs=opts['two_field_cse_pdfs']) - - update_rule_two_field = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_optimisation) - - if opts['compiled_in_boundaries']: - from lbmpy.boundaries import NoSlip, UBB - from lbmpy.boundaries.boundaries_in_kernel import update_rule_with_push_boundaries - from collections import OrderedDict - boundaries = OrderedDict(( - ((1, 0, 0), NoSlip()), - ((-1, 0, 0), NoSlip()), - ((0, 1, 0), NoSlip()), - ((0, -1, 0), NoSlip()), - ((0, 0, 1), UBB([0.05, 0, 0])), - ((0, 0, -1), NoSlip()), - )) - cr_even = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False), - lbm_optimisation=LBMOptimisation(cse_global=opts['aa_even_cse_global'], - cse_pdfs=opts['aa_even_cse_pdfs'])) - - cr_odd = create_lb_collision_rule(lbm_config=LBMConfig(stencil=LBStencil(Stencil.D3Q19), compressible=False), - lbm_optimisation=LBMOptimisation(cse_global=opts['aa_odd_cse_global'], - cse_pdfs=opts['aa_odd_cse_pdfs'])) - - update_rule_aa_even = update_rule_with_push_boundaries(cr_even, pdfs, boundaries, - AAEvenTimeStepAccessor, AAOddTimeStepAccessor.read) - update_rule_aa_odd = update_rule_with_push_boundaries(cr_odd, pdfs, boundaries, - AAOddTimeStepAccessor, AAEvenTimeStepAccessor.read) - else: - lbm_opt_even = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_even_split'], - cse_global=opts['aa_even_cse_global'], cse_pdfs=opts['aa_even_cse_pdfs']) - - update_rule_aa_even = create_lb_update_rule(lbm_config=LBMConfig(**options, - kernel_type=AAEvenTimeStepAccessor()), - lbm_optimisation=lbm_opt_even) - - lbm_opt_odd = LBMOptimisation(symbolic_field=pdfs, split=opts['aa_odd_split'], - cse_global=opts['aa_odd_cse_global'], cse_pdfs=opts['aa_odd_cse_pdfs']) - - update_rule_aa_odd = create_lb_update_rule(lbm_config=LBMConfig(**options, - kernel_type=AAOddTimeStepAccessor()), - lbm_optimisation=lbm_opt_odd) - - vec = {'assume_aligned': True, 'assume_inner_stride_one': True} - - # check if openmp is enabled in cmake - if ctx.openmp: - openmp_enabled = True - else: - openmp_enabled = False - - # Sweeps - vec['nontemporal'] = opts['two_field_nt_stores'] - generate_sweep(ctx, 'GenLbKernel', update_rule_two_field, field_swaps=[('pdfs', 'pdfs_tmp')], - cpu_vectorize_info=vec) - vec['nontemporal'] = opts['aa_even_nt_stores'] - generate_sweep(ctx, 'GenLbKernelAAEven', update_rule_aa_even, cpu_vectorize_info=vec, - cpu_openmp=openmp_enabled, ghost_layers=1) - vec['nontemporal'] = opts['aa_odd_nt_stores'] - generate_sweep(ctx, 'GenLbKernelAAOdd', update_rule_aa_odd, cpu_vectorize_info=vec, - cpu_openmp=openmp_enabled, ghost_layers=1) - - setter_assignments = macroscopic_values_setter(update_rule_two_field.method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=1.0) - getter_assignments = macroscopic_values_getter(update_rule_two_field.method, velocity=velocity_field.center_vector, - pdfs=pdfs.center_vector, density=None) - generate_sweep(ctx, 'GenMacroSetter', setter_assignments, cpu_openmp=openmp_enabled) - generate_sweep(ctx, 'GenMacroGetter', getter_assignments, cpu_openmp=openmp_enabled) - - # Communication - generate_pack_info_from_kernel(ctx, 'GenPackInfo', update_rule_two_field, - cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False) - generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPull', update_rule_aa_odd, kind='pull', - cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False) - generate_pack_info_from_kernel(ctx, 'GenPackInfoAAPush', update_rule_aa_odd, kind='push', - cpu_vectorize_info={'instruction_set': None}, cpu_openmp=False) - - generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfo', update_rule_two_field) - generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPull', update_rule_aa_odd, kind='pull') - generate_mpidtype_info_from_kernel(ctx, 'GenMpiDtypeInfoAAPush', update_rule_aa_odd, kind='push') - - additional_code = f""" - const char * infoStencil = "{stencil.name}"; - const char * infoConfigName = "{ctx.config}"; - const char * optimizationDict = "{str(opts)}"; - """ - - stencil_typedefs = {'Stencil_T': stencil, - 'CommunicationStencil_T': stencil} - field_typedefs = {'PdfField_T': pdfs, - 'VelocityField_T': velocity_field} - - generate_info_header(ctx, "GenDefines.h", stencil_typedefs=stencil_typedefs, field_typedefs=field_typedefs, - additional_code=additional_code) diff --git a/apps/benchmarks/UniformGridGenerated/params.py b/apps/benchmarks/UniformGridGenerated/params.py deleted file mode 100644 index 598855b63f1f54afe4b49c32aa35a637a2c2b58b..0000000000000000000000000000000000000000 --- a/apps/benchmarks/UniformGridGenerated/params.py +++ /dev/null @@ -1,174 +0,0 @@ -import math -import os -import operator -import waLBerla as wlb -from waLBerla.tools.sqlitedb import sequenceValuesToScalars, checkAndUpdateSchema, storeSingle -from waLBerla.tools.config import block_decomposition -from functools import reduce -import sqlite3 - - -def prod(seq): - return reduce(operator.mul, seq, 1) - - -def get_block_decomposition(block_decomposition, num_processes): - bx = by = bz = 1 - blocks_per_axis = int(math.log(num_processes, 2)) - for i in range(blocks_per_axis): - decomposition_axis = block_decomposition[i % len(block_decomposition)] - if decomposition_axis == 'y': - by *= 2 - elif decomposition_axis == 'z': - bz *= 2 - elif decomposition_axis == 'x': - bx *= 2 - - assert (bx * by * bz) == num_processes - return bx, by, bz - - -def calculate_time_steps(runtime, expected_mlups, domain_size): - cells = prod(domain_size) - time_steps_per_second = expected_mlups * 1e6 / cells - return int(time_steps_per_second * runtime) - - -def domain_decomposition_func_z(processes, threads, block_size): - return { - 'blocks': (1, 1, processes), - 'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads) - } - - -def domain_decomposition_func_full(processes, threads, block_size): - return { - 'blocks': block_decomposition(processes), - 'cellsPerBlock': (block_size[0], block_size[1], block_size[2] * threads) - } - - -class BenchmarkScenario: - def __init__(self, block_size=(256, 128, 128), direct_comm=True, - time_step_mode='aa', two_field_kernel_type='generated', - domain_decomposition_func=domain_decomposition_func_z, - db_file_name='uniform_grid_gen.sqlite'): - self.block_size = block_size - self.direct_comm = direct_comm - self.time_step_mode = time_step_mode - self.two_field_kernel_type = two_field_kernel_type - self.domain_decomposition_func = domain_decomposition_func - self.threads = int(os.environ['OMP_NUM_THREADS']) - self.processes = wlb.mpi.numProcesses() - self.db_file_name = db_file_name - - @wlb.member_callback - def config(self, **kwargs): - time_steps_for_128_cubed = 10 - time_steps = int(128**3 / prod(self.block_size) * time_steps_for_128_cubed) - time_steps = max(10, time_steps) - cfg = { - 'DomainSetup': { - 'periodic': (1, 1, 1), - }, - 'Parameters': { - 'timesteps': time_steps, - 'warmupSteps': 2, - 'outerIterations': 4, - 'vtkWriteFrequency': 0, - 'remainingTimeLoggerFrequency': 0, - 'omega': 1.6, - 'timeStepMode': self.time_step_mode, - 'twoFieldKernelType': self.two_field_kernel_type, - 'directComm': self.direct_comm, - } - } - cfg['DomainSetup'].update(self.domain_decomposition_func(self.processes, self.threads, self.block_size)) - return cfg - - @wlb.member_callback - def results_callback(self, mlupsPerProcess, optimizations, **kwargs): - cfg = self.config() - result = { - 'block_size': self.block_size, - 'mlups_per_core': mlupsPerProcess / self.threads, - 'threads': self.threads, - 'processes': self.processes, - 'time_step_mode': self.time_step_mode, - 'direct_comm': self.direct_comm, - 'time_steps': cfg['Parameters']['timesteps'], - 'I_MPI_PIN_CELL': os.environ.get('I_MPI_PIN_CELL', ''), - 'I_MPI_PIN_DOMAIN': os.environ.get('I_MPI_PIN_CELL', ''), - } - - optimizations = eval(optimizations) - result.update(optimizations) - result.update(kwargs) - sequenceValuesToScalars(result) - num_tries = 4 - for num_try in range(num_tries): - try: - checkAndUpdateSchema(result, "runs", self.db_file_name) - storeSingle(result, "runs", self.db_file_name) - break - except sqlite3.OperationalError as e: - wlb.log_warning(f"Sqlite DB writing failed: try {num_try + 1}/4 " + str(e)) - - -def block_size_ok(sc): - block_size = sc.config()['DomainSetup']['cellsPerBlock'] - return prod(block_size) * 19 < 2 ** 31 - - -def single_node_benchmark(): - scenarios = wlb.ScenarioManager() - for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64), - (1024, 64, 32), (2048, 64, 16), - (64, 32, 32), (32, 32, 32), (16, 16, 16), (256, 128, 64), (512, 128, 32)]: - for direct_comm in (True, False): - for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']: - if time_step_mode == 'twoField': - for kt in ['generated', 'manualGeneric', 'manualD3Q19']: - sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, - time_step_mode=time_step_mode, two_field_kernel_type=kt, - domain_decomposition_func=domain_decomposition_func_z - ) - if not block_size_ok(sc): - continue - scenarios.add(sc) - else: - sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, - domain_decomposition_func=domain_decomposition_func_z, - time_step_mode=time_step_mode) - if not block_size_ok(sc): - continue - # scenarios.add(sc) - - -def single_node_benchmark_small(): - scenarios = wlb.ScenarioManager() - for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64), - (1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16), - (256, 128, 64), (512, 128, 32)]: - for direct_comm in (True, False): - for time_step_mode in ['aa', 'aaKernelOnly', 'twoField']: - sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, time_step_mode=time_step_mode) - if not block_size_ok(sc): - continue - scenarios.add(sc) - - -def weak_scaling(): - scenarios = wlb.ScenarioManager() - for block_size in [(128, 128, 128), (128, 64, 64), (64, 64, 128), (64, 128, 64), (64, 64, 64), - (1024, 64, 32), (2048, 64, 16), (64, 32, 32), (32, 32, 32), (16, 16, 16), - (256, 128, 64), (512, 128, 32)]: - for direct_comm in (True, False): - sc = BenchmarkScenario(block_size=block_size, direct_comm=direct_comm, - domain_decomposition_func=domain_decomposition_func_full) - if not block_size_ok(sc): - continue - scenarios.add(sc) - - -single_node_benchmark() diff --git a/extern/lodepng/CMakeLists.txt b/extern/lodepng/CMakeLists.txt index 30f2e0da46c7f8efbfd062d2ac8382b50986b9d8..9f7e544cb710edef74a866db3f75abd64c34565f 100644 --- a/extern/lodepng/CMakeLists.txt +++ b/extern/lodepng/CMakeLists.txt @@ -1,8 +1,8 @@ add_library( - lodepng - EXCLUDE_FROM_ALL - lodepng.cpp - lodepng.h + lodepng + EXCLUDE_FROM_ALL + lodepng.cpp + lodepng.h ) target_include_directories(lodepng PUBLIC SYSTEM "${CMAKE_CURRENT_SOURCE_DIR}") @@ -10,4 +10,4 @@ if (MSVC) target_compile_options( lodepng PRIVATE /w ) else () target_compile_options( lodepng PRIVATE -w ) -endif () \ No newline at end of file +endif () diff --git a/python/lbmpy_walberla/packinfo.py b/python/lbmpy_walberla/packinfo.py index fef1c0026c7c0ed6acd95331a65ca354fed0bd5c..27f26b05b5e6e1fc68a093e24af07f149b9298e6 100644 --- a/python/lbmpy_walberla/packinfo.py +++ b/python/lbmpy_walberla/packinfo.py @@ -17,7 +17,7 @@ def generate_lb_pack_info(generation_context, namespace='lbm', target=Target.CPU, data_type=None, - cpu_openmp=None, + cpu_openmp=False, **create_kernel_params): """Generates waLBerla MPI PackInfos for an LBM kernel, based on a given method and streaming pattern. For in-place streaming patterns, two PackInfos are generated; diff --git a/python/pystencils_walberla/codegen.py b/python/pystencils_walberla/codegen.py index 6816ac87ec90e22bc93b5086b77f1ff8d259d760..de792b8fc3fd8231ed021c557bf5ed84848e3f07 100644 --- a/python/pystencils_walberla/codegen.py +++ b/python/pystencils_walberla/codegen.py @@ -160,7 +160,7 @@ def generate_selective_sweep(generation_context, class_name, selection_tree, int def generate_pack_info_for_field(generation_context, class_name: str, field: Field, direction_subset: Optional[Tuple[Tuple[int, int, int]]] = None, operator=None, gl_to_inner=False, - target=Target.CPU, data_type=None, cpu_openmp=None, + target=Target.CPU, data_type=None, cpu_openmp=False, **create_kernel_params): """Creates a pack info for a pystencils field assuming a pull-type stencil, packing all cell elements. @@ -188,7 +188,7 @@ def generate_pack_info_for_field(generation_context, class_name: str, field: Fie def generate_pack_info_from_kernel(generation_context, class_name: str, assignments: Sequence[Assignment], - kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=None, + kind='pull', operator=None, target=Target.CPU, data_type=None, cpu_openmp=False, **create_kernel_params): """Generates a waLBerla GPU PackInfo from a (pull) kernel. @@ -241,7 +241,7 @@ def generate_pack_info_from_kernel(generation_context, class_name: str, assignme def generate_pack_info(generation_context, class_name: str, directions_to_pack_terms: Dict[Tuple[Tuple], Sequence[Field.Access]], namespace='pystencils', operator=None, gl_to_inner=False, - target=Target.CPU, data_type=None, cpu_openmp=None, + target=Target.CPU, data_type=None, cpu_openmp=False, **create_kernel_params): """Generates a waLBerla GPU PackInfo @@ -258,6 +258,9 @@ def generate_pack_info(generation_context, class_name: str, cpu_openmp: if loops should use openMP or not. **create_kernel_params: remaining keyword arguments are passed to `pystencils.create_kernel` """ + if cpu_openmp: + raise ValueError("The packing kernels are already called inside an OpenMP parallel region. Thus " + "additionally parallelising each kernel is not supported.") items = [(e[0], sorted(e[1], key=lambda x: str(x))) for e in directions_to_pack_terms.items()] items = sorted(items, key=lambda e: e[0]) directions_to_pack_terms = OrderedDict(items) @@ -286,7 +289,7 @@ def generate_pack_info(generation_context, class_name: str, if len(data_types) == 0: raise ValueError("No fields to pack!") if len(data_types) != 1: - err_detail = "\n".join(" - {} [{}]".format(f.name, f.dtype) for f in fields_accessed) + err_detail = "\n".join(f" - {f.name} [{f.dtype}]" for f in fields_accessed) raise NotImplementedError("Fields of different data types are used - this is not supported.\n" + err_detail) dtype = data_types.pop() diff --git a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h index ab7ef56fd2b53e9b177b1222b89a087a37447569..d25c04b2b782fe891de361356aa046554d32f1ae 100644 --- a/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h +++ b/python/pystencils_walberla/templates/CpuPackInfo.tmpl.h @@ -44,7 +44,6 @@ public: unpackData( receiver, stencil::inverseDir[dir], rBuffer ); } -private: void packDataImpl(const IBlock * sender, stencil::Direction dir, mpi::SendBuffer & outBuffer) const { const auto dataSize = size(dir, sender); pack(dir, outBuffer.forward(dataSize), const_cast<IBlock*>(sender)); @@ -54,6 +53,7 @@ private: void unpack(stencil::Direction dir, unsigned char * buffer, IBlock * block) const; uint_t size (stencil::Direction dir, const IBlock * block) const; + private: {{fused_kernel|generate_members(parameters_to_ignore=['buffer'])|indent(4)}} }; diff --git a/python/pystencils_walberla/templates/Sweep.tmpl.cpp b/python/pystencils_walberla/templates/Sweep.tmpl.cpp index 7b095e32877f295f8f39f74501a7ecadcd6c02e8..96e589e1e549bd3ad28075e663ff399131dc4a33 100644 --- a/python/pystencils_walberla/templates/Sweep.tmpl.cpp +++ b/python/pystencils_walberla/templates/Sweep.tmpl.cpp @@ -102,7 +102,7 @@ void {{class_name}}::outer( {{- ["IBlock * block", kernel.kernel_selection_param { {{kernel|generate_block_data_to_field_extraction|indent(4)}} - if( layers_.size() == 0 ) + if( layers_.empty() ) { CellInterval ci; diff --git a/python/pystencils_walberla/tests/test_packinfo_generation.py b/python/pystencils_walberla/tests/test_packinfo_generation.py index c3fc2f50fb6c76a6db1dbcc609a049475f95f589..890a8469db5e9da88e5dfb617469bb6dfdb28bd8 100644 --- a/python/pystencils_walberla/tests/test_packinfo_generation.py +++ b/python/pystencils_walberla/tests/test_packinfo_generation.py @@ -28,8 +28,12 @@ class PackinfoGenTest(unittest.TestCase): for file_name_to_test in ('PI1.cpp', 'PI2.cpp'): file_to_test = ctx.files[file_name_to_test] + + # For Packing kernels it is better to not have OpenMP in the code because the packing kernels + # themselves are launched in an OpenMP environment. Still it could be forced but setting + # openmp to True in generate_pack_info_from_kernel if openmp: - assert '#pragma omp parallel' in file_to_test + assert '#pragma omp parallel' not in file_to_test if da: assert 'float ' not in file_to_test diff --git a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h index dc7c7d2c0fbd9f1d152793bcbe84d577dc708ed9..c47d815c111efb82e2d199f17763d620744397b4 100644 --- a/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h +++ b/src/cuda/lbm/CombinedInPlaceGpuPackInfo.h @@ -82,5 +82,3 @@ class CombinedInPlaceGpuPackInfo : public cuda::GeneratedGPUPackInfo } // namespace lbm } // namespace walberla - - diff --git a/src/lbm/communication/CombinedInPlaceCpuPackInfo.h b/src/lbm/communication/CombinedInPlaceCpuPackInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..2b00dabc6e2c08169a544040efac19ff703144c2 --- /dev/null +++ b/src/lbm/communication/CombinedInPlaceCpuPackInfo.h @@ -0,0 +1,124 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CombinedInPlaceCpuPackInfo.h +//! \author Markus Holzer <markus.holzer@fau.de> +// +//====================================================================================================================== + +#pragma once +#define IS_EVEN(x) (((x) &1) ^ 1) + +#include "communication/UniformPackInfo.h" + +#include "lbm/inplace_streaming/TimestepTracker.h" + +namespace walberla +{ +namespace lbm +{ +template< typename EvenPackInfo, typename OddPackInfo > +class CombinedInPlaceCpuPackInfo : public ::walberla::communication::UniformPackInfo +{ + public: + template< typename... Args > + CombinedInPlaceCpuPackInfo(std::shared_ptr< lbm::TimestepTracker >& tracker, Args&&... args) + : tracker_(tracker), evenPackInfo_(std::forward< Args >(args)...), oddPackInfo_(std::forward< Args >(args)...) + {} + + ~CombinedInPlaceCpuPackInfo() override = default; + bool constantDataExchange() const override { return true; } + bool threadsafeReceiving() const override { return true; } + + void unpackData(IBlock* receiver, stencil::Direction dir, mpi::RecvBuffer& buffer) override + { + if (IS_EVEN(tracker_->getCounter())) + { + return evenPackInfo_.unpackData(receiver, dir, buffer); + } + else + { + return oddPackInfo_.unpackData(receiver, dir, buffer); + } + } + + void communicateLocal(const IBlock* sender, IBlock* receiver, stencil::Direction dir) override + { + if (IS_EVEN(tracker_->getCounter())) + { + return evenPackInfo_.communicateLocal(sender, receiver, dir); + } + else + { + return oddPackInfo_.communicateLocal(sender, receiver, dir); + } + } + + void packDataImpl(const IBlock* sender, stencil::Direction dir, mpi::SendBuffer& outBuffer) const override + { + if (IS_EVEN(tracker_->getCounter())) + { + return evenPackInfo_.packDataImpl(sender, dir, outBuffer); + } + else + { + return oddPackInfo_.packDataImpl(sender, dir, outBuffer); + } + } + + void pack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (IS_EVEN(tracker_->getCounter())) + { + evenPackInfo_.pack(dir, buffer, block); + } + else + { + oddPackInfo_.pack(dir, buffer, block); + } + } + + void unpack(stencil::Direction dir, unsigned char* buffer, IBlock* block) const + { + if (IS_EVEN(tracker_->getCounter())) + { + evenPackInfo_.unpack(dir, buffer, block); + } + else + { + oddPackInfo_.unpack(dir, buffer, block); + } + } + + uint_t size(stencil::Direction dir, IBlock* block) const + { + if (IS_EVEN(tracker_->getCounter())) + { + return evenPackInfo_.size(dir, block); + } + else + { + return oddPackInfo_.size(dir, block); + } + } + + private: + const std::shared_ptr< lbm::TimestepTracker >& tracker_; + EvenPackInfo evenPackInfo_; + OddPackInfo oddPackInfo_; +}; + +} // namespace lbm +} // namespace walberla diff --git a/src/lbm/communication/all.h b/src/lbm/communication/all.h index e2d98f85b5881ff20202dccb9b4fb8b8918ea382..50beeff07c6518c8d091cc69a7b9360764a094bd 100644 --- a/src/lbm/communication/all.h +++ b/src/lbm/communication/all.h @@ -22,6 +22,7 @@ #pragma once +#include "CombinedInPlaceCpuPackInfo.h" #include "PdfFieldMPIDatatypeInfo.h" #include "PdfFieldPackInfo.h" diff --git a/tests/lbm/UnrollTest.cpp b/tests/lbm/UnrollTest.cpp index e32be61148ba623c45e711ea12f0e3bd902fa54f..a7b352de1d29be978b60e3c7bc2926cfb1aba307 100644 --- a/tests/lbm/UnrollTest.cpp +++ b/tests/lbm/UnrollTest.cpp @@ -21,7 +21,6 @@ #include "lbm/IntelCompilerOptimization.h" #include "core/DataTypes.h" -#include "field/Field.h" #include <iostream> #include <sstream>