diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp index 08e5de4d928d9a23e76104823721dfbb5d811f69..494424a562e25503c6b189fa5cf2d957fafca313 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -194,11 +194,11 @@ int main(int argc, char** argv) gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU)); - auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); }); + auto evenComm = std::function< void() >([&]() { comEven.communicate(); }); gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU)); - auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); }); + auto oddComm = std::function< void() >([&]() { comODD.communicate(); }); #else blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks); evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID)); diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt index de1a2c1db0cbd078a80879f964f6a046f98afc95..2b37ed6fb19797229ae5507f4d26f3f031491a87 100644 --- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt @@ -13,9 +13,9 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated waLBerla_add_executable( NAME NonUniformGridGenerator FILES NonUniformGridGenerator.cpp LdcSetup.h - DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk ) + DEPENDS blockforest core field python_coupling ) waLBerla_add_executable( NAME NonUniformGridCPU FILES NonUniformGridCPU.cpp LdcSetup.h - DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) + DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated ) diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h index 1657e85e0e58dc0a1107ba62505e438c1821a1c1..070656cb23582a4da83f954c3e108aa70e69b315 100644 --- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h +++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h @@ -14,12 +14,15 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file LdcSetup.h +//! \author Markus Holzer <markus.holzer@fau.de> //! \author Frederik Hennig <frederik.hennig@fau.de> // //====================================================================================================================== +#pragma once #include "blockforest/SetupBlock.h" #include "blockforest/SetupBlockForest.h" +#include "blockforest/StructuredBlockForest.h" #include "core/all.h" @@ -45,14 +48,14 @@ class LDCRefinement { const AABB & domain = forest.getDomain(); - real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); - real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); - AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(), - domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() ); + const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(), - domain.xMax(), domain.yMax(), domain.zMax() ); + const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); for(auto & block : forest) { diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt index d6840007e14d5f5af685bb5b262c8bcfd6138d6e..f6b4e1ff3779f624c8fb9845425d9d6a86103ee9 100644 --- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt @@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated NonUniformGridGPUBoundaryCollection.h NonUniformGridGPUInfoHeader.h) waLBerla_add_executable( NAME NonUniformGridGPU - FILES NonUniformGridGPU.cpp - DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file + FILES NonUniformGridGPU.cpp LdcSetup.h + DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h new file mode 100644 index 0000000000000000000000000000000000000000..238943a7daa9745054980e6011d46ef037ef27ec --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h @@ -0,0 +1,110 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file LdcSetup.h +//! \author Markus Holzer <markus.holzer@fau.de> +//! \author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== +#pragma once + +#include "blockforest/SetupBlock.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/StructuredBlockForest.h" + +#include "core/all.h" + +#include "field/FlagField.h" + +#include "field/FlagUID.h" + +using namespace walberla; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; +using FlagField_T = FlagField< uint8_t >; + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) const + { + const AABB & domain = forest.getDomain(); + + const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + + const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); + + const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); + + for(auto & block : forest) + { + auto & aabb = block.getAABB(); + if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) + { + if( block.getLevel() < refinementDepth_) + block.setMarker( true ); + } + } + } +}; + +class LDC +{ + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + + public: + explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + RefinementSelectionFunctor refinementSelector() const + { + return LDCRefinement(refinementDepth_); + } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + auto& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } +}; diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp index 8110dbbb4437b8d3c56d9b855de501fd55521454..919755d6d7cd481dd90a2f1310965e0c6947432c 100644 --- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -19,7 +19,6 @@ //====================================================================================================================== #include "blockforest/Initialization.h" -#include "blockforest/SetupBlockForest.h" #include "blockforest/loadbalancing/StaticCurve.h" #include "core/Environment.h" @@ -54,6 +53,7 @@ #include <cmath> +#include "LdcSetup.h" #include "NonUniformGridGPUInfoHeader.h" using namespace walberla; @@ -69,82 +69,6 @@ using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection; using gpu::communication::NonUniformGPUScheme; -using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; - -class LDCRefinement -{ - private: - const uint_t refinementDepth_; - - public: - explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; - - void operator()(SetupBlockForest& forest) const - { - const AABB & domain = forest.getDomain(); - - real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); - real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); - - AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), - domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - - AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), - domain.xMax(), domain.yMin() + ySize, domain.zMax() ); - - for(auto & block : forest) - { - auto & aabb = block.getAABB(); - if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) - { - if( block.getLevel() < refinementDepth_) - block.setMarker( true ); - } - } - } -}; - -class LDC -{ - private: - const std::string refinementProfile_; - const uint_t refinementDepth_; - - const FlagUID noSlipFlagUID_; - const FlagUID ubbFlagUID_; - - public: - explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; - - RefinementSelectionFunctor refinementSelector() const - { - return LDCRefinement(refinementDepth_); - } - - void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) - { - for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) - { - auto& b = dynamic_cast< Block& >(*bIt); - const uint_t level = b.getLevel(); - auto flagField = b.getData< FlagField_T >(flagFieldID); - const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); - const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); - for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) - { - const Cell localCell = cIt.cell(); - Cell globalCell(localCell); - sbfs.transformBlockLocalToGlobalCell(globalCell, b); - if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } - else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || - globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) - { - flagField->addFlag(localCell, noslipFlag); - } - } - } - } -}; namespace { void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) { diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..34bc6caa92b92d5239c9ca1409660b062247a469 100644 --- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py @@ -26,7 +26,7 @@ class Scenario: }, 'Parameters': { 'omega': 1.95, - 'timesteps': 10001, + 'timesteps': 30001, 'refinementDepth': self.refinement_depth, 'writeSetupForestAndReturn': False, @@ -37,7 +37,7 @@ class Scenario: 'remainingTimeLoggerFrequency': 3, - 'vtkWriteFrequency': 5000, + 'vtkWriteFrequency': 10000, }, 'Logging': { 'logLevel': "info", diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp index c8992a65afb93fa7dae572959a654651f14aabde..2a59e6be99b49942a169d1c24921a93cbe8abd1e 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp +++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp @@ -18,7 +18,6 @@ // //====================================================================================================================== #include "blockforest/Initialization.h" -#include "blockforest/communication/UniformDirectScheme.h" #include "core/Environment.h" #include "core/logging/Initialization.h" @@ -66,7 +65,7 @@ typedef gpu::GPUField< real_t > GPUField; int main(int argc, char** argv) { - mpi::Environment env(argc, argv); + const mpi::Environment env(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) gpu::selectDeviceBasedOnMpiRank(); #endif @@ -92,14 +91,14 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) // CPU fields - BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); - BlockDataID vel_field_gpu = + const BlockDataID vel_field_gpu = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); @@ -215,15 +214,15 @@ int main(int argc, char** argv) auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps); #if defined(WALBERLA_BUILD_WITH_CUDA) auto normalTimeStep = [&]() { - Comm_velocity_based_distributions->startCommunication(defaultStream); + Comm_velocity_based_distributions->startCommunication(); for (auto& block : *blocks) phase_field_LB_step(&block, defaultStream); - Comm_velocity_based_distributions->wait(defaultStream); + Comm_velocity_based_distributions->wait(); - Comm_phase_field_distributions->startCommunication(defaultStream); + Comm_phase_field_distributions->startCommunication(); for (auto& block : *blocks) hydro_LB_step(&block, defaultStream); - Comm_phase_field_distributions->wait(defaultStream); + Comm_phase_field_distributions->wait(); }; auto phase_only = [&]() { for (auto& block : *blocks) diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index fc229eb116f86ee6c967a8fba5a1948f7b73fc64..dfcd22a87e6942fb7ac2bc5789ac92fdd65fec9f 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -68,8 +68,8 @@ int main(int argc, char** argv) { const mpi::Environment env(argc, argv); - std::string input_filename(argv[1]); - bool inputIsPython = string_ends_with(input_filename, ".py"); + const std::string input_filename(argv[1]); + const bool inputIsPython = string_ends_with(input_filename, ".py"); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py index c3076cb3e1f457ecae1f6a5090adac2314c34e97..21235056434f3daeebdbac212ec8de60d58810b4 100644 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -22,12 +22,12 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK ldc_setup = {'Border': [ - {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, ]} diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0..2607004f3749f366f8155a0dd200202f00e45867 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -25,7 +25,7 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_add_executable(NAME UniformGridGPU_${config} FILES UniformGridGPU.cpp - DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) + DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk UniformGridGPUGenerated_${config}) # all configs are excluded from all except for pull d3q27. if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index ee022f457738fb6f8aa71f615441e9279fd25eca..fdc8969d626b866b978dfd1260565c50f96f01b8 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -167,22 +167,22 @@ int main(int argc, char** argv) if (timeStepStrategy == "noOverlap") { if (boundariesConfig){ - timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication") << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); }else { - timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");} } else if (timeStepStrategy == "simpleOverlap") { if (boundariesConfig){ - timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame"); }else{ - timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");} @@ -240,14 +240,14 @@ int main(int argc, char** argv) WALBERLA_GPU_CHECK(gpuPeekAtLastError()) timeLoop.setCurrentTimeStepToZero(); - WcTimingPool const timeloopTiming; + WcTimingPool timeloopTiming; WcTimer simTimer; WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) WALBERLA_GPU_CHECK( gpuPeekAtLastError() ) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); - timeLoop.run(); + timeLoop.run(timeloopTiming); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) simTimer.end(); diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index e1972d914a5acc26ab54aaf3cb86c615ac4d3b77..74be4378e0e2acef0bcb3c36f0f6d64916bba6c8 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -26,12 +26,13 @@ BASE_CONFIG = { } ldc_setup = {'Border': [ - {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + ]} @@ -55,7 +56,7 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): class Scenario: - def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1), + def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1), timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False, inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False, boundary_setup=False, @@ -110,7 +111,11 @@ class Scenario: 'innerOuterSplit': self.inner_outer_split, 'vtkWriteFrequency': self.vtk_write_frequency, 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency + }, + 'Logging': { + 'logLevel': 'info', # info progress detail tracing } + } if self.boundary_setup: config_dict["Boundaries"] = ldc_setup @@ -184,12 +189,14 @@ def overlap_benchmark(): # no overlap scenarios.add(Scenario(time_step_strategy='noOverlap', inner_outer_split=(1, 1, 1), - cuda_enabled_mpi=cuda_enabled_mpi)) + cuda_enabled_mpi=cuda_enabled_mpi, + outer_iterations=1)) for inner_outer_split in inner_outer_splits: scenario = Scenario(time_step_strategy='simpleOverlap', inner_outer_split=inner_outer_split, - cuda_enabled_mpi=cuda_enabled_mpi) + cuda_enabled_mpi=cuda_enabled_mpi, + outer_iterations=1) scenarios.add(scenario) @@ -228,6 +235,7 @@ def single_gpu_benchmark(): cuda_blocks=cuda_block_size, time_step_strategy='kernelOnly', timesteps=num_time_steps(block_size, 2000), + outer_iterations=1, additional_info=additional_info) scenarios.add(scenario) @@ -237,18 +245,18 @@ def validation_run(): wlb.log_info_on_root("Validation run") wlb.log_info_on_root("") - time_step_strategy = "noOverlap" # "noOverlap" + time_step_strategy = "noOverlap" # "simpleOverlap" scenarios = wlb.ScenarioManager() - scenario = Scenario(cells_per_block=(64, 64, 64), + scenario = Scenario(cells_per_block=(128, 128, 128), time_step_strategy=time_step_strategy, - timesteps=1000, + timesteps=10001, outer_iterations=1, warmup_steps=0, init_shear_flow=False, boundary_setup=True, - vtk_write_frequency=0, - remaining_time_logger_frequency=10) + vtk_write_frequency=5000, + remaining_time_logger_frequency=30) scenarios.add(scenario) diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp index 2800b98cb65008ef5d66aa98853fb5589087c8d5..9b883282a0437628203cfb04de660f0301b5758a 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp @@ -72,7 +72,7 @@ typedef gpu::GPUField< uint8_t > GPUField_int; int main(int argc, char** argv) { - mpi::Environment Env(argc, argv); + const mpi::Environment Env(argc, argv); gpu::selectDeviceBasedOnMpiRank(); exportDataStructuresToPython(); @@ -114,17 +114,17 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); // Flag field - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); - BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + const BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); auto physical_parameters = config->getOneBlock("PhysicalParameters"); const real_t density_liquid = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0)); @@ -181,11 +181,11 @@ int main(int argc, char** argv) interface_thickness); pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu); - pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, + const pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, mobility, interface_thickness, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); - pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, + const pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gravitational_acceleration, interface_thickness, density_liquid, density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); @@ -193,8 +193,8 @@ int main(int argc, char** argv) //////////////////////// // ADD COMMUNICATION // ////////////////////// - int streamLowPriority = 0; - int streamHighPriority = 0; + const int streamLowPriority = 0; + const int streamHighPriority = 0; auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); @@ -204,20 +204,20 @@ int main(int argc, char** argv) make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions); auto Comm_velocity_based_distributions = - std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(); }); auto Comm_velocity_based_distributions_start = - std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(); }); auto Comm_velocity_based_distributions_wait = - std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(); }); auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu); UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field); - auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); }); + auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); }); auto Comm_phase_field_start = - std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(defaultStream); }); - auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(); }); + auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(); }); auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); @@ -225,11 +225,11 @@ int main(int argc, char** argv) make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions); auto Comm_phase_field_distributions = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); }); auto Comm_phase_field_distributions_start = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); }); auto Comm_phase_field_distributions_wait = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); }); //////////////////////// // BOUNDARY HANDLING // @@ -394,7 +394,7 @@ int main(int argc, char** argv) targetRank, MPI_COMM_WORLD); WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank) { - std::string path = ""; + const std::string path = ""; std::ostringstream out; out << std::internal << std::setfill('0') << std::setw(6) << counter; geometry::writeMesh( diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp index 1856106c5b61880752c7216ee10eabda140485b1..0d57d59433ee7879ff3deb272d2a8c1cc8ff3419 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp @@ -205,7 +205,7 @@ int main(int argc, char** argv) const bool sendDirectlyFromGPU = false; gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU); com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); - auto communication = std::function< void() >([&]() { com.communicate(nullptr); }); + auto communication = std::function< void() >([&]() { com.communicate(); }); #else blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); diff --git a/extern/pybind11 b/extern/pybind11 index 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07..f7b499615e14d70ab098a20deb0cdb3889998a1a 160000 --- a/extern/pybind11 +++ b/extern/pybind11 @@ -1 +1 @@ -Subproject commit 8b03ffa7c06cd9c8a38297b1c8923695d1ff1b07 +Subproject commit f7b499615e14d70ab098a20deb0cdb3889998a1a diff --git a/src/blockforest/Initialization.cpp b/src/blockforest/Initialization.cpp index b91923eebd5b53156fbe3e916c8aeb84963a283b..d800a75b10a112edba39a6655c84bfad53cb1426 100644 --- a/src/blockforest/Initialization.cpp +++ b/src/blockforest/Initialization.cpp @@ -130,7 +130,7 @@ shared_ptr< StructuredBlockForest > createUniformBlockGridFromConfig( const Conf cell_idx_c(cells[1])-1, cell_idx_c(cells[2])-1 ); - uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() ); + const uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() ); calculateCellDistribution( cells, nrOfProcesses, blocks, cellsPerBlock ); } diff --git a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h index 38a97d82451d6fc114541f892aa023660c3a9b02..c3ae21b55cd3709ff28bdf7012151d51b6b311eb 100644 --- a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h +++ b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h @@ -65,7 +65,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block BoundarySetter<Handling> boundarySetter; boundarySetter.setConfigBlock( blockHandle ); - std::string directionStr = blockHandle.getParameter<std::string>( "direction" ); + const std::string directionStr = blockHandle.getParameter<std::string>( "direction" ); cell_idx_t wallDistance = blockHandle.getParameter<cell_idx_t>( "walldistance", 0 ); cell_idx_t ghostLayersToInitialize = blockHandle.getParameter<cell_idx_t>( "ghostLayersToInitialize", std::numeric_limits<cell_idx_t>::max() ); @@ -75,8 +75,8 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block using stencil::D3Q7; for( auto dirIt = D3Q7::beginNoCenter(); dirIt != D3Q7::end(); ++dirIt ) { - bool isAll = string_icompare( directionStr, "all" ) == 0; - bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(), + const bool isAll = string_icompare( directionStr, "all" ) == 0; + const bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(), stencil::dirToString[*dirIt] ) != directionStrings.end(); if( isAll || isInDirectionStrings ) @@ -87,7 +87,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block } if ( ! atLeastOneBoundarySet ) - WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B "); + WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B ") } diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h index 093ec4cad2a830a80042073f905bb1c7316bf8ae..745d28cc5f18e0df1ce6eeeda0cfbf5d478656ee 100644 --- a/src/gpu/communication/NonUniformGPUScheme.h +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -307,6 +307,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i for (auto it : headers_[EQUAL_LEVEL][index]) bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -397,6 +400,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t for (auto it : headers_[COARSE_TO_FINE][index]) bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -431,7 +437,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) - pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); + pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr); } } else @@ -500,6 +506,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t for (auto it : headers_[FINE_TO_COARSE][index]) bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -532,7 +541,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) - pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); + pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr); } } else diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..bc481d8950c25d4aa5196316c641e8b67e34318a 100644 --- a/src/gpu/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -18,7 +18,6 @@ //! \author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== - #pragma once #include "blockforest/StructuredBlockForest.h" @@ -32,9 +31,7 @@ #include <thread> -#include "gpu/GPURAII.h" #include "gpu/GPUWrapper.h" -#include "gpu/ParallelStreams.h" #include "gpu/communication/CustomMemoryBuffer.h" #include "gpu/communication/GeneratedGPUPackInfo.h" @@ -49,29 +46,34 @@ namespace communication { class UniformGPUScheme { public: - explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, - bool sendDirectlyFromGPU = false, - bool useLocalCommunication = true, + explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const bool sendDirectlyFromGPU = false, + const bool useLocalCommunication = true, const int tag = 5432 ); - explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, - const Set<SUID> & requiredBlockSelectors, - const Set<SUID> & incompatibleBlockSelectors, - bool sendDirectlyFromGPU = false, - bool useLocalCommunication = true, - const int tag = 5432 ); + explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const Set<SUID> & requiredBlockSelectors, + const Set<SUID> & incompatibleBlockSelectors, + const bool sendDirectlyFromGPU = false, + const bool useLocalCommunication = true, + const int tag = 5432 ); + ~UniformGPUScheme() + { + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i])) + } void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); - void startCommunication( gpuStream_t stream = nullptr); - void wait( gpuStream_t stream = nullptr); + void startCommunication(); + void wait(); - void operator()( gpuStream_t stream = nullptr ) { communicate( stream ); } - inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); } + void operator()() { communicate( ); } + inline void communicate() { startCommunication(); wait(); } - std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr ); - std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr ); - std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getCommunicateFunctor(); + std::function<void()> getStartCommunicateFunctor(); + std::function<void()> getWaitFunctor(); private: void setupCommunication(); @@ -81,8 +83,8 @@ namespace communication { bool setupBeforeNextCommunication_; bool communicationInProgress_; - bool sendFromGPU_; - bool useLocalCommunication_; + const bool sendFromGPU_; + const bool useLocalCommunication_; using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer; using GpuBuffer_T = gpu::communication::GPUMemoryBuffer; @@ -92,8 +94,6 @@ namespace communication { std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_; - ParallelStreams parallelSectionManager_; - struct Header { BlockID blockId; @@ -103,6 +103,8 @@ namespace communication { Set<SUID> requiredBlockSelectors_; Set<SUID> incompatibleBlockSelectors_; + + gpuStream_t streams_[Stencil::Q]; }; diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h index 93f6dd85e0e3f44293b9943e1bf252ce52c6ad33..84d9e0f22dd5661d1d428525d3758a5bb9a29488 100644 --- a/src/gpu/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -19,10 +19,6 @@ // //====================================================================================================================== -#include "core/mpi/MPIWrapper.h" - -#include "gpu/ParallelStreams.h" - namespace walberla { namespace gpu { @@ -30,9 +26,9 @@ namespace communication { template<typename Stencil> - UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, - bool sendDirectlyFromGPU, - bool useLocalCommunication, + UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const bool sendDirectlyFromGPU, + const bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), @@ -41,7 +37,6 @@ namespace communication { useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), - parallelSectionManager_( -1 ), requiredBlockSelectors_( Set<SUID>::emptySet() ), incompatibleBlockSelectors_( Set<SUID>::emptySet() ) { @@ -52,14 +47,17 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) } template<typename Stencil> - UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, + UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, - bool sendDirectlyFromGPU, - bool useLocalCommunication, + const bool sendDirectlyFromGPU, + const bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), @@ -68,7 +66,6 @@ namespace communication { useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), - parallelSectionManager_( -1 ), requiredBlockSelectors_( requiredBlockSelectors ), incompatibleBlockSelectors_( incompatibleBlockSelectors ) { @@ -78,11 +75,14 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) } template<typename Stencil> - void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream ) + void UniformGPUScheme<Stencil>::startCommunication( ) { WALBERLA_ASSERT( !communicationInProgress_ ) auto forest = blockForest_.lock(); @@ -102,9 +102,11 @@ namespace communication { for( auto it : headers_ ) bufferSystemGPU_.sendBuffer( it.first ).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto &iBlock : *forest ) { auto senderBlock = dynamic_cast< Block * >( &iBlock ); @@ -127,7 +129,7 @@ namespace communication { auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); for (auto& pi : packInfos_) { - pi->communicateLocal(*dir, senderBlock, receiverBlock, stream); + pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]); } } else @@ -136,26 +138,27 @@ namespace communication { for( auto &pi : packInfos_ ) { - parallelSection.run([&](auto s) { auto size = pi->size( *dir, senderBlock ); auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - pi->pack( *dir, gpuDataPtr, senderBlock, s ); + pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] ); if( !sendFromGPU_ ) { auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) - WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] )) } - }); } } } } } // wait for packing to finish - WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) ); + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } if( sendFromGPU_ ) bufferSystemGPU_.sendAll(); @@ -167,7 +170,7 @@ namespace communication { template<typename Stencil> - void UniformGPUScheme<Stencil>::wait( gpuStream_t stream ) + void UniformGPUScheme<Stencil>::wait() { WALBERLA_ASSERT( communicationInProgress_ ) @@ -175,7 +178,6 @@ namespace communication { if( sendFromGPU_ ) { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo ) { recvInfo.buffer().clear(); @@ -188,16 +190,13 @@ namespace communication { auto size = pi->size( header.dir, block ); auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); - }); + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] ); } } } } else { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo ) { auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank()); @@ -214,17 +213,17 @@ namespace communication { auto gpuDataPtr = gpuBuffer.advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size, - gpuMemcpyHostToDevice, s )) - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); - }); + gpuMemcpyHostToDevice, streams_[header.dir] )) + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] ); } } } } - - WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } communicationInProgress_ = false; } @@ -312,21 +311,21 @@ namespace communication { } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor() { - return [this, stream]() { communicate( stream ); }; + return [this]() { communicate( ); }; } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor() { - return [this, stream]() { startCommunication( stream ); }; + return [this]() { startCommunication(); }; } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor() { - return [this, stream]() { wait( stream ); }; + return [this]() { wait(); }; } } // namespace communication diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp index 5bc87aa13f9a2c72351b532fb20e3614cdfc8d82..a8a60321d9ee45f131843dbcb9f974f41ace0b01 100644 --- a/tests/gpu/communication/CommTest.cpp +++ b/tests/gpu/communication/CommTest.cpp @@ -47,7 +47,7 @@ void hostToHost() hostField2.set(hostField1); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void hostToDevice() @@ -61,7 +61,7 @@ void hostToDevice() gpu::fieldCpy(deviceField, hostField); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void deviceToHost() @@ -76,7 +76,7 @@ void deviceToHost() gpu::fieldCpy(hostField, deviceField); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiHostToHost() @@ -100,7 +100,7 @@ void mpiHostToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiHostToDevice() @@ -124,7 +124,7 @@ void mpiHostToDevice() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiDeviceToHost() @@ -148,7 +148,7 @@ void mpiDeviceToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiDeviceToDevice() @@ -172,7 +172,7 @@ void mpiDeviceToDevice() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiCopyHostToDevice() @@ -199,7 +199,7 @@ void mpiCopyHostToDevice() gpu::fieldCpy(deviceField, hostField2); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiCopyDeviceToHost() @@ -226,7 +226,7 @@ void mpiCopyDeviceToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } int main(int argc, char** argv)