diff --git a/CMakeLists.txt b/CMakeLists.txt index ad983d0599758187c7b207bf6bee094acc0f26fd..e3b808b8d12c0c708bef8d1fbd1c280aab9a62e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1255,28 +1255,34 @@ endif() ## Half precision ## ############################################################################################################################ -if (WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) - if (WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG) - message(STATUS "Configuring with *experimental* half precision (float16) support. You better know what you are doing.") - if (WALBERLA_CXX_COMPILER_IS_GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0) - message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " - "Half precision support for gcc has only been tested with version >= 12. " - "You are using a previous version - it may not work correctly.") - endif () - if (WALBERLA_CXX_COMPILER_IS_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " - "Half precision support for clang has only been tested with version >= 15. " - "You are using a previous version - it may not work correctly.") - endif () - if (NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST) - message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " - "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!") - endif () - else () - message(FATAL_ERROR "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " - "Half precision support is currently only available for gcc and clang.") - endif () -endif () +if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) + ### Compiler requirements: + ### Within this project, there are several checks to ensure that the template parameter 'ValueType' + ### is a floating point number. The check is_floating_point<ValueType> is done primarily in our MPI implementation. + ### The IEE 754 floating type format _Float16, evaluates to true only if your compiler supports the + ### open C++23 standard P1467R9 (Extended floating-point types and standard names). + ### Compare: + ### https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1467r9.html + ### + ### Right now (18.12.2023) this is the case only for gcc13. + ### For more information see: + ### https://gcc.gnu.org/projects/cxx-status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names + ### https://clang.llvm.org/cxx_status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names + + try_compile( WALBERLA_SUPPORT_HALF_PRECISION "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/cmake/TestFloat16.cpp" + CXX_STANDARD 23 OUTPUT_VARIABLE TRY_COMPILE_OUTPUT ) + ## message( STATUS ${TRY_COMPILE_OUTPUT} ) + if ( NOT WALBERLA_SUPPORT_HALF_PRECISION ) + message( FATAL_ERROR "Compiler: ${CMAKE_CXX_COMPILER} Version: ${CMAKE_CXX_COMPILER_VERSION} does not support half precision" ) + endif () + + # Local host optimization + if ( NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST ) + message( WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!" 
) + endif () # Local host check + +endif () # Check if WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is set ############################################################################################################################ # Documentation Generation diff --git a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp index 08e5de4d928d9a23e76104823721dfbb5d811f69..494424a562e25503c6b189fa5cf2d957fafca313 100644 --- a/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp +++ b/apps/benchmarks/FlowAroundSphereCodeGen/FlowAroundSphereCodeGen.cpp @@ -194,11 +194,11 @@ int main(int argc, char** argv) gpu::communication::UniformGPUScheme< Stencil_T > comEven(blocks, false); comEven.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldIDGPU)); - auto evenComm = std::function< void() >([&]() { comEven.communicate(nullptr); }); + auto evenComm = std::function< void() >([&]() { comEven.communicate(); }); gpu::communication::UniformGPUScheme< Stencil_T > comODD(blocks, false); comODD.addPackInfo(make_shared< PackInfoOdd_T >(pdfFieldIDGPU)); - auto oddComm = std::function< void() >([&]() { comODD.communicate(nullptr); }); + auto oddComm = std::function< void() >([&]() { comODD.communicate(); }); #else blockforest::communication::UniformBufferedScheme< Stencil_T > evenComm(blocks); evenComm.addPackInfo(make_shared< PackInfoEven_T >(pdfFieldID)); diff --git a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt index de1a2c1db0cbd078a80879f964f6a046f98afc95..2b37ed6fb19797229ae5507f4d26f3f031491a87 100644 --- a/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt +++ b/apps/benchmarks/NonUniformGridCPU/CMakeLists.txt @@ -13,9 +13,9 @@ waLBerla_generate_target_from_python(NAME NonUniformGridCPUGenerated waLBerla_add_executable( NAME NonUniformGridGenerator FILES NonUniformGridGenerator.cpp LdcSetup.h - DEPENDS blockforest core domain_decomposition field geometry python_coupling vtk ) + DEPENDS blockforest core field python_coupling ) waLBerla_add_executable( NAME NonUniformGridCPU FILES NonUniformGridCPU.cpp LdcSetup.h - DEPENDS blockforest boundary core domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridCPUGenerated ) + DEPENDS blockforest boundary core domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridCPUGenerated ) diff --git a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h index 1657e85e0e58dc0a1107ba62505e438c1821a1c1..070656cb23582a4da83f954c3e108aa70e69b315 100644 --- a/apps/benchmarks/NonUniformGridCPU/LdcSetup.h +++ b/apps/benchmarks/NonUniformGridCPU/LdcSetup.h @@ -14,12 +14,15 @@ // with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. // //! \file LdcSetup.h +//! \author Markus Holzer <markus.holzer@fau.de> //! 
\author Frederik Hennig <frederik.hennig@fau.de> // //====================================================================================================================== +#pragma once #include "blockforest/SetupBlock.h" #include "blockforest/SetupBlockForest.h" +#include "blockforest/StructuredBlockForest.h" #include "core/all.h" @@ -45,14 +48,14 @@ class LDCRefinement { const AABB & domain = forest.getDomain(); - real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); - real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); - AABB leftCorner( domain.xMin(), domain.yMax() - ySize, domain.zMin(), - domain.xMin() + xSize, domain.yMax() + ySize, domain.zMax() ); + const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - AABB rightCorner( domain.xMax() - xSize, domain.yMax() - ySize, domain.zMin(), - domain.xMax(), domain.yMax(), domain.zMax() ); + const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); for(auto & block : forest) { diff --git a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt index d6840007e14d5f5af685bb5b262c8bcfd6138d6e..f6b4e1ff3779f624c8fb9845425d9d6a86103ee9 100644 --- a/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/NonUniformGridGPU/CMakeLists.txt @@ -11,5 +11,5 @@ waLBerla_generate_target_from_python(NAME NonUniformGridGPUGenerated NonUniformGridGPUBoundaryCollection.h NonUniformGridGPUInfoHeader.h) waLBerla_add_executable( NAME NonUniformGridGPU - FILES NonUniformGridGPU.cpp - DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file + FILES NonUniformGridGPU.cpp LdcSetup.h + DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk NonUniformGridGPUGenerated ) \ No newline at end of file diff --git a/apps/benchmarks/NonUniformGridGPU/LdcSetup.h b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h new file mode 100644 index 0000000000000000000000000000000000000000..238943a7daa9745054980e6011d46ef037ef27ec --- /dev/null +++ b/apps/benchmarks/NonUniformGridGPU/LdcSetup.h @@ -0,0 +1,110 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file LdcSetup.h +//! \author Markus Holzer <markus.holzer@fau.de> +//! 
\author Frederik Hennig <frederik.hennig@fau.de> +// +//====================================================================================================================== +#pragma once + +#include "blockforest/SetupBlock.h" +#include "blockforest/SetupBlockForest.h" +#include "blockforest/StructuredBlockForest.h" + +#include "core/all.h" + +#include "field/FlagField.h" + +#include "field/FlagUID.h" + +using namespace walberla; +using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; +using FlagField_T = FlagField< uint8_t >; + +class LDCRefinement +{ + private: + const uint_t refinementDepth_; + + public: + explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; + + void operator()(SetupBlockForest& forest) const + { + const AABB & domain = forest.getDomain(); + + const real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); + const real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); + + const AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), + domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); + + const AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), + domain.xMax(), domain.yMin() + ySize, domain.zMax() ); + + for(auto & block : forest) + { + auto & aabb = block.getAABB(); + if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) + { + if( block.getLevel() < refinementDepth_) + block.setMarker( true ); + } + } + } +}; + +class LDC +{ + private: + const std::string refinementProfile_; + const uint_t refinementDepth_; + + const FlagUID noSlipFlagUID_; + const FlagUID ubbFlagUID_; + + public: + explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; + + RefinementSelectionFunctor refinementSelector() const + { + return LDCRefinement(refinementDepth_); + } + + void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) + { + for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) + { + auto& b = dynamic_cast< Block& >(*bIt); + const uint_t level = b.getLevel(); + auto flagField = b.getData< FlagField_T >(flagFieldID); + const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); + const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); + for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) + { + const Cell localCell = cIt.cell(); + Cell globalCell(localCell); + sbfs.transformBlockLocalToGlobalCell(globalCell, b); + if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } + else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || + globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) + { + flagField->addFlag(localCell, noslipFlag); + } + } + } + } +}; diff --git a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp index 8110dbbb4437b8d3c56d9b855de501fd55521454..919755d6d7cd481dd90a2f1310965e0c6947432c 100644 --- a/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp +++ b/apps/benchmarks/NonUniformGridGPU/NonUniformGridGPU.cpp @@ -19,7 +19,6 @@ //====================================================================================================================== #include "blockforest/Initialization.h" -#include "blockforest/SetupBlockForest.h" #include "blockforest/loadbalancing/StaticCurve.h" #include "core/Environment.h" @@ -54,6 +53,7 
@@ #include <cmath> +#include "LdcSetup.h" #include "NonUniformGridGPUInfoHeader.h" using namespace walberla; @@ -69,82 +69,6 @@ using BoundaryCollection_T = lbm::NonUniformGridGPUBoundaryCollection< FlagField using SweepCollection_T = lbm::NonUniformGridGPUSweepCollection; using gpu::communication::NonUniformGPUScheme; -using RefinementSelectionFunctor = SetupBlockForest::RefinementSelectionFunction; - -class LDCRefinement -{ - private: - const uint_t refinementDepth_; - - public: - explicit LDCRefinement(const uint_t depth) : refinementDepth_(depth){}; - - void operator()(SetupBlockForest& forest) const - { - const AABB & domain = forest.getDomain(); - - real_t xSize = ( domain.xSize() / real_t(12) ) * real_c( 0.99 ); - real_t ySize = ( domain.ySize() / real_t(12) ) * real_c( 0.99 ); - - AABB leftCorner( domain.xMin(), domain.yMin(), domain.zMin(), - domain.xMin() + xSize, domain.yMin() + ySize, domain.zMax() ); - - AABB rightCorner( domain.xMax() - xSize, domain.yMin(), domain.zMin(), - domain.xMax(), domain.yMin() + ySize, domain.zMax() ); - - for(auto & block : forest) - { - auto & aabb = block.getAABB(); - if( leftCorner.intersects( aabb ) || rightCorner.intersects( aabb ) ) - { - if( block.getLevel() < refinementDepth_) - block.setMarker( true ); - } - } - } -}; - -class LDC -{ - private: - const std::string refinementProfile_; - const uint_t refinementDepth_; - - const FlagUID noSlipFlagUID_; - const FlagUID ubbFlagUID_; - - public: - explicit LDC(const uint_t depth) : refinementDepth_(depth), noSlipFlagUID_("NoSlip"), ubbFlagUID_("UBB"){}; - - RefinementSelectionFunctor refinementSelector() const - { - return LDCRefinement(refinementDepth_); - } - - void setupBoundaryFlagField(StructuredBlockForest& sbfs, const BlockDataID flagFieldID) - { - for (auto bIt = sbfs.begin(); bIt != sbfs.end(); ++bIt) - { - auto& b = dynamic_cast< Block& >(*bIt); - const uint_t level = b.getLevel(); - auto flagField = b.getData< FlagField_T >(flagFieldID); - const uint8_t noslipFlag = flagField->registerFlag(noSlipFlagUID_); - const uint8_t ubbFlag = flagField->registerFlag(ubbFlagUID_); - for (auto cIt = flagField->beginWithGhostLayerXYZ(2); cIt != flagField->end(); ++cIt) - { - const Cell localCell = cIt.cell(); - Cell globalCell(localCell); - sbfs.transformBlockLocalToGlobalCell(globalCell, b); - if (globalCell.y() >= cell_idx_c(sbfs.getNumberOfYCells(level))) { flagField->addFlag(localCell, ubbFlag); } - else if (globalCell.z() < 0 || globalCell.y() < 0 || globalCell.x() < 0 || - globalCell.x() >= cell_idx_c(sbfs.getNumberOfXCells(level)) || globalCell.z() >= cell_idx_c(sbfs.getNumberOfZCells(level))) - { - flagField->addFlag(localCell, noslipFlag); - } - } - } - } -}; namespace { void createSetupBlockForest(SetupBlockForest& setupBfs, const Config::BlockHandle& domainSetup, LDC& ldcSetup, const uint_t numProcesses=uint_c(MPIManager::instance()->numProcesses())) { diff --git a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py index ccfcecacfb5c0f5a79b56aea13409cc3da4d2748..34bc6caa92b92d5239c9ca1409660b062247a469 100644 --- a/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/NonUniformGridGPU/simulation_setup/benchmark_configs.py @@ -26,7 +26,7 @@ class Scenario: }, 'Parameters': { 'omega': 1.95, - 'timesteps': 10001, + 'timesteps': 30001, 'refinementDepth': self.refinement_depth, 'writeSetupForestAndReturn': False, @@ -37,7 +37,7 @@ class Scenario: 
'remainingTimeLoggerFrequency': 3, - 'vtkWriteFrequency': 5000, + 'vtkWriteFrequency': 10000, }, 'Logging': { 'logLevel': "info", diff --git a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp index c8992a65afb93fa7dae572959a654651f14aabde..2a59e6be99b49942a169d1c24921a93cbe8abd1e 100644 --- a/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp +++ b/apps/benchmarks/PhaseFieldAllenCahn/benchmark_multiphase.cpp @@ -18,7 +18,6 @@ // //====================================================================================================================== #include "blockforest/Initialization.h" -#include "blockforest/communication/UniformDirectScheme.h" #include "core/Environment.h" #include "core/logging/Initialization.h" @@ -66,7 +65,7 @@ typedef gpu::GPUField< real_t > GPUField; int main(int argc, char** argv) { - mpi::Environment env(argc, argv); + const mpi::Environment env(argc, argv); #if defined(WALBERLA_BUILD_WITH_CUDA) gpu::selectDeviceBasedOnMpiRank(); #endif @@ -92,14 +91,14 @@ int main(int argc, char** argv) #if defined(WALBERLA_BUILD_WITH_CUDA) // CPU fields - BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); + const BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); // GPU fields - BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); - BlockDataID vel_field_gpu = + const BlockDataID vel_field_gpu = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); @@ -215,15 +214,15 @@ int main(int argc, char** argv) auto timeLoop = make_shared< SweepTimeloop >(blocks->getBlockStorage(), timesteps); #if defined(WALBERLA_BUILD_WITH_CUDA) auto normalTimeStep = [&]() { - Comm_velocity_based_distributions->startCommunication(defaultStream); + Comm_velocity_based_distributions->startCommunication(); for (auto& block : *blocks) phase_field_LB_step(&block, defaultStream); - Comm_velocity_based_distributions->wait(defaultStream); + Comm_velocity_based_distributions->wait(); - Comm_phase_field_distributions->startCommunication(defaultStream); + Comm_phase_field_distributions->startCommunication(); for (auto& block : *blocks) hydro_LB_step(&block, defaultStream); - Comm_phase_field_distributions->wait(defaultStream); + Comm_phase_field_distributions->wait(); }; auto phase_only = [&]() { for (auto& block : *blocks) diff --git a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp index fc229eb116f86ee6c967a8fba5a1948f7b73fc64..dfcd22a87e6942fb7ac2bc5789ac92fdd65fec9f 100644 --- a/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp +++ b/apps/benchmarks/UniformGridCPU/UniformGridCPU.cpp @@ -68,8 +68,8 @@ int main(int argc, char** argv) { const 
mpi::Environment env(argc, argv); - std::string input_filename(argv[1]); - bool inputIsPython = string_ends_with(input_filename, ".py"); + const std::string input_filename(argv[1]); + const bool inputIsPython = string_ends_with(input_filename, ".py"); for (auto cfg = python_coupling::configBegin(argc, argv); cfg != python_coupling::configEnd(); ++cfg) { diff --git a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py index c3076cb3e1f457ecae1f6a5090adac2314c34e97..21235056434f3daeebdbac212ec8de60d58810b4 100644 --- a/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridCPU/simulation_setup/benchmark_configs.py @@ -22,12 +22,12 @@ def num_time_steps(block_size, time_steps_for_128_block=TIME_STEPS_FOR_128_BLOCK ldc_setup = {'Border': [ - {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'T', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, ]} diff --git a/apps/benchmarks/UniformGridGPU/CMakeLists.txt b/apps/benchmarks/UniformGridGPU/CMakeLists.txt index 66a5b0fa4f4a3588f36ba4dbd5feb732131f76d0..2607004f3749f366f8155a0dd200202f00e45867 100644 --- a/apps/benchmarks/UniformGridGPU/CMakeLists.txt +++ b/apps/benchmarks/UniformGridGPU/CMakeLists.txt @@ -25,7 +25,7 @@ foreach(streaming_pattern pull push aa esotwist) waLBerla_add_executable(NAME UniformGridGPU_${config} FILES UniformGridGPU.cpp - DEPENDS blockforest boundary core gpu domain_decomposition field geometry python_coupling timeloop vtk UniformGridGPUGenerated_${config}) + DEPENDS blockforest boundary core gpu domain_decomposition field geometry lbm_generated python_coupling timeloop vtk UniformGridGPUGenerated_${config}) # all configs are excluded from all except for pull d3q27. 
if (${streaming_pattern} STREQUAL "pull" AND ${stencil} STREQUAL "d3q27") diff --git a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp index ee022f457738fb6f8aa71f615441e9279fd25eca..fdc8969d626b866b978dfd1260565c50f96f01b8 100644 --- a/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp +++ b/apps/benchmarks/UniformGridGPU/UniformGridGPU.cpp @@ -167,22 +167,22 @@ int main(int argc, char** argv) if (timeStepStrategy == "noOverlap") { if (boundariesConfig){ - timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication") << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide"); }else { - timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(defaultStream), "communication") + timeLoop.add() << BeforeFunction(communication.getCommunicateFunctor(), "communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::ALL, defaultStream), "LBM StreamCollide");} } else if (timeStepStrategy == "simpleOverlap") { if (boundariesConfig){ - timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") << Sweep(boundaryCollection.getSweep(BoundaryCollection_T::ALL, defaultStream), "Boundary Conditions"); timeLoop.add() << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER, defaultStream), "LBM StreamCollide Outer Frame"); }else{ - timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(defaultStream), "Start Communication") + timeLoop.add() << BeforeFunction(communication.getStartCommunicateFunctor(), "Start Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::INNER, defaultStream), "LBM StreamCollide Inner Frame"); timeLoop.add() << BeforeFunction(communication.getWaitFunctor(), "Wait for Communication") << Sweep(sweepCollection.streamCollide(SweepCollection_T::OUTER,defaultStream), "LBM StreamCollide Outer Frame");} @@ -240,14 +240,14 @@ int main(int argc, char** argv) WALBERLA_GPU_CHECK(gpuPeekAtLastError()) timeLoop.setCurrentTimeStepToZero(); - WcTimingPool const timeloopTiming; + WcTimingPool timeloopTiming; WcTimer simTimer; WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) WALBERLA_GPU_CHECK( gpuPeekAtLastError() ) WALBERLA_LOG_INFO_ON_ROOT("Starting simulation with " << timesteps << " time steps") simTimer.start(); - timeLoop.run(); + timeLoop.run(timeloopTiming); WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) simTimer.end(); diff --git a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py index e1972d914a5acc26ab54aaf3cb86c615ac4d3b77..74be4378e0e2acef0bcb3c36f0f6d64916bba6c8 100755 --- a/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py +++ b/apps/benchmarks/UniformGridGPU/simulation_setup/benchmark_configs.py @@ -26,12 +26,13 @@ BASE_CONFIG = { } ldc_setup = {'Border': [ - {'direction': 'N', 'walldistance': -1, 'flag': 'NoSlip'}, - 
{'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'N', 'walldistance': -1, 'flag': 'UBB'}, + {'direction': 'W', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'E', 'walldistance': -1, 'flag': 'NoSlip'}, - {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'S', 'walldistance': -1, 'flag': 'NoSlip'}, {'direction': 'B', 'walldistance': -1, 'flag': 'NoSlip'}, + {'direction': 'T', 'walldistance': -1, 'flag': 'NoSlip'}, + ]} @@ -55,7 +56,7 @@ def domain_block_size_ok(block_size, total_mem, gls=1, q=27, size_per_value=8): class Scenario: - def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(256, 1, 1), + def __init__(self, cells_per_block=(256, 128, 128), periodic=(1, 1, 1), cuda_blocks=(128, 1, 1), timesteps=None, time_step_strategy="normal", omega=1.8, cuda_enabled_mpi=False, inner_outer_split=(1, 1, 1), warmup_steps=5, outer_iterations=3, init_shear_flow=False, boundary_setup=False, @@ -110,7 +111,11 @@ class Scenario: 'innerOuterSplit': self.inner_outer_split, 'vtkWriteFrequency': self.vtk_write_frequency, 'remainingTimeLoggerFrequency': self.remaining_time_logger_frequency + }, + 'Logging': { + 'logLevel': 'info', # info progress detail tracing } + } if self.boundary_setup: config_dict["Boundaries"] = ldc_setup @@ -184,12 +189,14 @@ def overlap_benchmark(): # no overlap scenarios.add(Scenario(time_step_strategy='noOverlap', inner_outer_split=(1, 1, 1), - cuda_enabled_mpi=cuda_enabled_mpi)) + cuda_enabled_mpi=cuda_enabled_mpi, + outer_iterations=1)) for inner_outer_split in inner_outer_splits: scenario = Scenario(time_step_strategy='simpleOverlap', inner_outer_split=inner_outer_split, - cuda_enabled_mpi=cuda_enabled_mpi) + cuda_enabled_mpi=cuda_enabled_mpi, + outer_iterations=1) scenarios.add(scenario) @@ -228,6 +235,7 @@ def single_gpu_benchmark(): cuda_blocks=cuda_block_size, time_step_strategy='kernelOnly', timesteps=num_time_steps(block_size, 2000), + outer_iterations=1, additional_info=additional_info) scenarios.add(scenario) @@ -237,18 +245,18 @@ def validation_run(): wlb.log_info_on_root("Validation run") wlb.log_info_on_root("") - time_step_strategy = "noOverlap" # "noOverlap" + time_step_strategy = "noOverlap" # "simpleOverlap" scenarios = wlb.ScenarioManager() - scenario = Scenario(cells_per_block=(64, 64, 64), + scenario = Scenario(cells_per_block=(128, 128, 128), time_step_strategy=time_step_strategy, - timesteps=1000, + timesteps=10001, outer_iterations=1, warmup_steps=0, init_shear_flow=False, boundary_setup=True, - vtk_write_frequency=0, - remaining_time_logger_frequency=10) + vtk_write_frequency=5000, + remaining_time_logger_frequency=30) scenarios.add(scenario) diff --git a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp index 2800b98cb65008ef5d66aa98853fb5589087c8d5..9b883282a0437628203cfb04de660f0301b5758a 100644 --- a/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp +++ b/apps/showcases/PhaseFieldAllenCahn/GPU/multiphase.cpp @@ -72,7 +72,7 @@ typedef gpu::GPUField< uint8_t > GPUField_int; int main(int argc, char** argv) { - mpi::Environment Env(argc, argv); + const mpi::Environment Env(argc, argv); gpu::selectDeviceBasedOnMpiRank(); exportDataStructuresToPython(); @@ -114,17 +114,17 @@ int main(int argc, char** argv) BlockDataID vel_field = field::addToStorage< VelocityField_T >(blocks, "vel", real_c(0.0), field::fzyx); BlockDataID phase_field = field::addToStorage< PhaseField_T >(blocks, "phase", real_c(0.0), field::fzyx); 
// GPU fields - BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_phase_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb phase field on GPU", Stencil_phase_T::Size, field::fzyx, 1); - BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( + const BlockDataID lb_velocity_field_gpu = gpu::addGPUFieldToStorage< gpu::GPUField< real_t > >( blocks, "lb velocity field on GPU", Stencil_hydro_T::Size, field::fzyx, 1); BlockDataID vel_field_gpu = gpu::addGPUFieldToStorage< VelocityField_T >(blocks, vel_field, "velocity field on GPU", true); BlockDataID phase_field_gpu = gpu::addGPUFieldToStorage< PhaseField_T >(blocks, phase_field, "phase field on GPU", true); // Flag field - BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); - BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); + const BlockDataID flagFieldID = field::addFlagFieldToStorage< FlagField_T >(blocks, "flag field"); + const BlockDataID flagFieldID_gpu = gpu::addGPUFieldToStorage< FlagField_T >(blocks, flagFieldID, "flag on GPU", true); auto physical_parameters = config->getOneBlock("PhysicalParameters"); const real_t density_liquid = physical_parameters.getParameter< real_t >("density_liquid", real_c(1.0)); @@ -181,11 +181,11 @@ int main(int argc, char** argv) interface_thickness); pystencils::initialize_velocity_based_distributions init_g(lb_velocity_field_gpu, vel_field_gpu); - pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, + const pystencils::phase_field_LB_step phase_field_LB_step(flagFieldID_gpu, lb_phase_field_gpu, phase_field_gpu, vel_field_gpu, mobility, interface_thickness, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); - pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, + const pystencils::hydro_LB_step hydro_LB_step(flagFieldID_gpu, lb_velocity_field_gpu, phase_field_gpu, vel_field_gpu, gravitational_acceleration, interface_thickness, density_liquid, density_gas, surface_tension, relaxation_time_liquid, relaxation_time_gas, gpuBlockSize[0], gpuBlockSize[1], gpuBlockSize[2]); @@ -193,8 +193,8 @@ int main(int argc, char** argv) //////////////////////// // ADD COMMUNICATION // ////////////////////// - int streamLowPriority = 0; - int streamHighPriority = 0; + const int streamLowPriority = 0; + const int streamHighPriority = 0; auto defaultStream = gpu::StreamRAII::newPriorityStream(streamLowPriority); auto innerOuterStreams = gpu::ParallelStreams(streamHighPriority); @@ -204,20 +204,20 @@ int main(int argc, char** argv) make_shared< lbm::PackInfo_velocity_based_distributions >(lb_velocity_field_gpu); UniformGPUSchemeVelocityDistributions->addPackInfo(generatedPackInfo_velocity_based_distributions); auto Comm_velocity_based_distributions = - std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->communicate(); }); auto Comm_velocity_based_distributions_start = - std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->startCommunication(); }); auto Comm_velocity_based_distributions_wait = - std::function< void() >([&]() { 
UniformGPUSchemeVelocityDistributions->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemeVelocityDistributions->wait(); }); auto UniformGPUSchemePhaseField = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); auto generatedPackInfo_phase_field = make_shared< pystencils::PackInfo_phase_field >(phase_field_gpu); UniformGPUSchemePhaseField->addPackInfo(generatedPackInfo_phase_field); - auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(defaultStream); }); + auto Comm_phase_field = std::function< void() >([&]() { UniformGPUSchemePhaseField->communicate(); }); auto Comm_phase_field_start = - std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(defaultStream); }); - auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseField->startCommunication(); }); + auto Comm_phase_field_wait = std::function< void() >([&]() { UniformGPUSchemePhaseField->wait(); }); auto UniformGPUSchemePhaseFieldDistributions = make_shared< gpu::communication::UniformGPUScheme< Stencil_hydro_T > >(blocks, cudaEnabledMpi); @@ -225,11 +225,11 @@ int main(int argc, char** argv) make_shared< lbm::PackInfo_phase_field_distributions >(lb_phase_field_gpu); UniformGPUSchemePhaseFieldDistributions->addPackInfo(generatedPackInfo_phase_field_distributions); auto Comm_phase_field_distributions = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->communicate(); }); auto Comm_phase_field_distributions_start = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->startCommunication(); }); auto Comm_phase_field_distributions_wait = - std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(defaultStream); }); + std::function< void() >([&]() { UniformGPUSchemePhaseFieldDistributions->wait(); }); //////////////////////// // BOUNDARY HANDLING // @@ -394,7 +394,7 @@ int main(int argc, char** argv) targetRank, MPI_COMM_WORLD); WALBERLA_EXCLUSIVE_WORLD_SECTION(targetRank) { - std::string path = ""; + const std::string path = ""; std::ostringstream out; out << std::internal << std::setfill('0') << std::setw(6) << counter; geometry::writeMesh( diff --git a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp index 1856106c5b61880752c7216ee10eabda140485b1..2dbeb9c8e2d82d28bc508c4ff215d6fa04bc2ca4 100644 --- a/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp +++ b/apps/tutorials/codegen/03_AdvancedLBMCodegen.cpp @@ -203,9 +203,9 @@ int main(int argc, char** argv) // Communication #if defined(WALBERLA_BUILD_WITH_GPU_SUPPORT) const bool sendDirectlyFromGPU = false; - gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU); + gpu::communication::UniformGPUScheme< Stencil_T > com(blocks, sendDirectlyFromGPU, false); com.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); - auto communication = std::function< void() >([&]() { com.communicate(nullptr); }); + auto communication = std::function< void() >([&]() { com.communicate(); }); #else blockforest::communication::UniformBufferedScheme< Stencil_T > communication(blocks); 
communication.addPackInfo(make_shared< PackInfo_T >(pdfFieldId)); diff --git a/cmake/FindOpenMesh.cmake b/cmake/FindOpenMesh.cmake index dfe98d2dd3758f50fd042bb3d265e79131447ff0..4dbb7f7dd06fe02b5019b4a52e76287eafbc8651 100644 --- a/cmake/FindOpenMesh.cmake +++ b/cmake/FindOpenMesh.cmake @@ -15,7 +15,7 @@ #=========================================================================== # # OpenMesh -# Copyright (c) 2001-2015, RWTH-Aachen University +# Copyright (c) 2001-2023, RWTH-Aachen University # Department of Computer Graphics and Multimedia # All rights reserved. # www.openmesh.org @@ -53,7 +53,7 @@ # #=========================================================================== -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.3.0) #if already found via finder or simulated finder in openmesh CMakeLists.txt, skip the search IF (NOT OPENMESH_FOUND) @@ -63,6 +63,13 @@ IF (NOT OPENMESH_FOUND) "${CMAKE_SOURCE_DIR}/OpenMesh/src/OpenMesh" "${CMAKE_SOURCE_DIR}/libs_required/OpenMesh/src/OpenMesh" "${CMAKE_SOURCE_DIR}/../OpenMesh/src/OpenMesh" + "C:/Program Files/OpenMesh 10.0" + "C:/Program Files/OpenMesh 9.0" + "C:/Program Files/OpenMesh 8.1" + "C:/Program Files/OpenMesh 8.0" + "C:/Program Files/OpenMesh 7.2" + "C:/Program Files/OpenMesh 7.1" + "C:/Program Files/OpenMesh 7.0" "C:/Program Files/OpenMesh 6.3" "C:/Program Files/OpenMesh 6.2" "C:/Program Files/OpenMesh 6.1" @@ -81,6 +88,12 @@ IF (NOT OPENMESH_FOUND) "C:/Program Files/OpenMesh 2.4.1" "C:/Program Files/OpenMesh 2.4" "C:/Program Files/OpenMesh 2.0/include" + "C:/libs/OpenMesh 10.0" + "C:/libs/OpenMesh 9.0" + "C:/libs/OpenMesh 8.1" + "C:/libs/OpenMesh 8.0" + "C:/libs/OpenMesh 7.1" + "C:/libs/OpenMesh 7.0" "C:/libs/OpenMesh 6.3" "C:/libs/OpenMesh 6.2" "C:/libs/OpenMesh 6.1" @@ -98,7 +111,7 @@ IF (NOT OPENMESH_FOUND) "C:/libs/OpenMesh 3.0" "C:/libs/OpenMesh 2.4.1" "C:/libs/OpenMesh 2.4" - "${OPENMESH_DIR}" + "${OPENMESH_LIBRARY_DIR}" ) FIND_PATH (OPENMESH_INCLUDE_DIR OpenMesh/Core/Mesh/PolyMeshT.hh diff --git a/cmake/TestFloat16.cpp b/cmake/TestFloat16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bae373fbb5af94c152c405f4572bdbb764a43c04 --- /dev/null +++ b/cmake/TestFloat16.cpp @@ -0,0 +1,7 @@ +#include <iostream> + + +int main() +{ + static_assert(std::is_floating_point_v<_Float16>); +} \ No newline at end of file diff --git a/python/lbmpy_walberla/additional_data_handler.py b/python/lbmpy_walberla/additional_data_handler.py index 16e5cb35f8ed64ae77eeda06c08812bbb57e950a..efd86fe92043777b69620b99d138fc1331a98580 100644 --- a/python/lbmpy_walberla/additional_data_handler.py +++ b/python/lbmpy_walberla/additional_data_handler.py @@ -3,7 +3,11 @@ from pystencils.stencil import inverse_direction from pystencils.typing import BasicType from lbmpy.advanced_streaming import AccessPdfValues, numeric_offsets, numeric_index, Timestep, is_inplace -from lbmpy.advanced_streaming.indexing import MirroredStencilDirections +# until lbmpy version 1.3.2 +try: + from lbmpy.advanced_streaming.indexing import MirroredStencilDirections +except ImportError: + from lbmpy.custom_code_nodes import MirroredStencilDirections from lbmpy.boundaries.boundaryconditions import LbBoundary from lbmpy.boundaries import ExtrapolationOutflow, FreeSlip, UBB diff --git a/src/blockforest/Initialization.cpp b/src/blockforest/Initialization.cpp index b91923eebd5b53156fbe3e916c8aeb84963a283b..d800a75b10a112edba39a6655c84bfad53cb1426 100644 --- a/src/blockforest/Initialization.cpp +++ b/src/blockforest/Initialization.cpp @@ -130,7 
+130,7 @@ shared_ptr< StructuredBlockForest > createUniformBlockGridFromConfig( const Conf cell_idx_c(cells[1])-1, cell_idx_c(cells[2])-1 ); - uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() ); + const uint_t nrOfProcesses = uint_c( MPIManager::instance()->numProcesses() ); calculateCellDistribution( cells, nrOfProcesses, blocks, cellsPerBlock ); } diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 099e0b5732a9d5b112600d150162f728bf8dcb8f..71bb209d70167ecd5ef056c61350269667029930 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -29,6 +29,15 @@ add_library( core ) if( MPI_FOUND ) target_link_libraries( core PUBLIC MPI::MPI_CXX ) endif() + +if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) + # Actual support for float16 is available only since C++23 + # before is_arithmetic and is_floating_point evaluated to false, + # also many STL functions are compatible with float16 only since C++23. + # Which features are actually supported depend on the compiler + target_compile_features(core PUBLIC cxx_std_23) +endif () + target_link_libraries( core PUBLIC ${SERVICE_LIBS} ) target_sources( core PRIVATE diff --git a/src/core/DataTypes.cpp b/src/core/DataTypes.cpp index ead9f6fb680c12a053c9ebf4fe9fe9333c443027..0b9dcad1fe5e4c0e8973cf2f2719b003fb7134a0 100644 --- a/src/core/DataTypes.cpp +++ b/src/core/DataTypes.cpp @@ -26,6 +26,11 @@ namespace walberla { namespace real_comparison { + #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +// const bfloat16 Epsilon< bfloat16 >::value = static_cast< bfloat16 >(1e-2); // machine eps is 2^-7 + const float16 Epsilon< float16 >::value = static_cast< float16 >(1e-3); // machine eps is 2^-10 + // Note, depending on the kind of float16 <bfloat, float16> another Epsilon must be used. + #endif const float Epsilon< float >::value = static_cast< float >(1e-4); const double Epsilon< double >::value = static_cast< double >(1e-8); const long double Epsilon< long double >::value = static_cast< long double >(1e-10); diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h index bae5b7651eaa17bc67c9fe822eeb386de38f61ca..4e7c019a86dcf0ce23fb7c8a9e66e6add8500bde 100644 --- a/src/core/DataTypes.h +++ b/src/core/DataTypes.h @@ -175,21 +175,33 @@ using real_t = float; /// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future. /// #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT -# if defined(WALBERLA_CXX_COMPILER_IS_CLANG) || defined(WALBERLA_CXX_COMPILER_IS_GNU) -/// Clang version must be 15 or higher for x86 half precision support. -/// GCC version must be 12 or higher for x86 half precision support. -/// Also support seems to require SSE, so ensure that respective instruction sets are enabled. +/// FIXME: (not really right) Clang version must be 15 or higher for x86 half precision support. +/// FIXME: (not really right) GCC version must be 12 or higher for x86 half precision support. +/// FIXME: (I don't know) Also support seems to require SSE, so ensure that respective instruction sets are enabled. /// See /// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point /// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html /// for more information. +/// ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +/// Compiler requirements: +/// Within this project, there are several checks to ensure that the template parameter 'ValueType' +/// is a floating point number. 
The check is_floating_point<ValueType> is done primarily in our MPI implementation. +/// The IEE 754 floating type format _Float16, evaluates to true only if your compiler supports the +/// open C++23 standard P1467R9 (Extended floating-point types and standard names). +/// Compare: +/// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1467r9.html +/// +/// Right now (18.12.2023) this is the case only for gcc13. +/// For more information see: +/// https://gcc.gnu.org/projects/cxx-status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names +/// https://clang.llvm.org/cxx_status.html#:~:text=Extended%20floating%2Dpoint%20types%20and%20standard%20names + using half = _Float16; +// Note: there are two possible float16 formats. +// The one used right now is the IEE 754 float16 standard, consisting of a 5 bit exponent and a 10 bit mantissa. +// Another possible half precision format would be the one from Google Brain (bfloat16) with an 8 bit exponent and a 7 bit mantissa. +// Compare https://i10git.cs.fau.de/ab04unyc/walberla/-/issues/23 using float16 = half; -# else -static_assert(false, "\n\n### Attempting to built walberla with half precision support.\n" - "### However, the compiler you chose is not suited for that, or we simply have not implemented " - "support for half precision and your compiler.\n"); -# endif #endif using float32 = float; using float64 = double; @@ -228,6 +240,10 @@ inline bool realIsIdentical( const real_t a, const real_t b ) namespace real_comparison { template< class T > struct Epsilon; + #ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + using walberla::float16; + template<> struct Epsilon< float16 > { static const float16 value; }; + #endif template<> struct Epsilon< float > { static const float value; }; template<> struct Epsilon< double > { static const double value; }; template<> struct Epsilon< long double > { static const long double value; }; @@ -254,6 +270,14 @@ inline bool floatIsEqual( float lhs, float rhs, const float epsilon = real_compa return std::fabs( lhs - rhs ) < epsilon; } +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +inline bool floatIsEqual( walberla::float16 lhs, walberla::float16 rhs, const walberla::float16 epsilon = real_comparison::Epsilon<walberla::float16>::value ) +{ + const auto difference = lhs - rhs; + return ( (difference < 0) ? -difference : difference ) < epsilon; +} +#endif + } // namespace walberla #define WALBERLA_UNUSED(x) (void)(x) diff --git a/src/core/Environment.h b/src/core/Environment.h index 492467d52b62f266e3c348615369010da208d03b..61ef4f2b13db47c5f2ba7e3bed11cc95c43980e9 100644 --- a/src/core/Environment.h +++ b/src/core/Environment.h @@ -44,9 +44,8 @@ public: * * \param argc,argv If command line parameters are present they have to contain at least * the path to the configuration file and optionally pairs of param-value: - * "-myParameter myValue" - * These values are then substituted in configuration files at - * positions marked with $(myParameter) + * "-blockName.parameterName=parameterValue" + * These values are then substituted in configuration files. 
* It is also possible to pass no command line options (see description below ) * * If command line arguments are present the constructor initializes static objects diff --git a/src/core/mpi/MPIWrapper.h b/src/core/mpi/MPIWrapper.h index 51ab22e26ed3b9e38a858ab8040c39325751ff12..eecee3136c702a61e7987d25705076a7c780a635 100644 --- a/src/core/mpi/MPIWrapper.h +++ b/src/core/mpi/MPIWrapper.h @@ -353,6 +353,9 @@ WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned short int , MPI_UNSIGNED_SHORT WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned int , MPI_UNSIGNED ); WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long int , MPI_UNSIGNED_LONG ); WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( unsigned long long , MPI_UNSIGNED_LONG_LONG ); +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( walberla::float16 , MPI_WCHAR ); +#endif WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( float , MPI_FLOAT ); WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( double , MPI_DOUBLE ); WALBERLA_CREATE_MPITRAIT_SPECIALIZATION( long double , MPI_LONG_DOUBLE ); diff --git a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h index 38a97d82451d6fc114541f892aa023660c3a9b02..c3ae21b55cd3709ff28bdf7012151d51b6b311eb 100644 --- a/src/geometry/initializer/BoundaryFromDomainBorder.impl.h +++ b/src/geometry/initializer/BoundaryFromDomainBorder.impl.h @@ -65,7 +65,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block BoundarySetter<Handling> boundarySetter; boundarySetter.setConfigBlock( blockHandle ); - std::string directionStr = blockHandle.getParameter<std::string>( "direction" ); + const std::string directionStr = blockHandle.getParameter<std::string>( "direction" ); cell_idx_t wallDistance = blockHandle.getParameter<cell_idx_t>( "walldistance", 0 ); cell_idx_t ghostLayersToInitialize = blockHandle.getParameter<cell_idx_t>( "ghostLayersToInitialize", std::numeric_limits<cell_idx_t>::max() ); @@ -75,8 +75,8 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block using stencil::D3Q7; for( auto dirIt = D3Q7::beginNoCenter(); dirIt != D3Q7::end(); ++dirIt ) { - bool isAll = string_icompare( directionStr, "all" ) == 0; - bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(), + const bool isAll = string_icompare( directionStr, "all" ) == 0; + const bool isInDirectionStrings = std::find( directionStrings.begin(), directionStrings.end(), stencil::dirToString[*dirIt] ) != directionStrings.end(); if( isAll || isInDirectionStrings ) @@ -87,7 +87,7 @@ void BoundaryFromDomainBorder<Handling>::init( const Config::BlockHandle & block } if ( ! atLeastOneBoundarySet ) - WALBERLA_ABORT( "Invalid Direction " << directionStr << ". Allowed values: all, N,S,W,E,T,B "); + WALBERLA_ABORT( "Invalid Direction " << directionStr << ". 
Allowed values: all, N,S,W,E,T,B ") } diff --git a/src/gpu/communication/NonUniformGPUScheme.h b/src/gpu/communication/NonUniformGPUScheme.h index 093ec4cad2a830a80042073f905bb1c7316bf8ae..745d28cc5f18e0df1ce6eeeda0cfbf5d478656ee 100644 --- a/src/gpu/communication/NonUniformGPUScheme.h +++ b/src/gpu/communication/NonUniformGPUScheme.h @@ -307,6 +307,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationEqualLevel(const uint_t i for (auto it : headers_[EQUAL_LEVEL][index]) bufferSystemGPU_[EQUAL_LEVEL][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -397,6 +400,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t for (auto it : headers_[COARSE_TO_FINE][index]) bufferSystemGPU_[COARSE_TO_FINE][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -431,7 +437,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationCoarseToFine(const uint_t { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.remainingSize(), pi->sizeCoarseToFineSend(coarseBlock, fineReceiverId, *dir)) - pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); + pi->communicateLocalCoarseToFine(coarseBlock, fineReceiverBlock, *dir, gpuDataBuffer, nullptr); } } else @@ -500,6 +506,9 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t for (auto it : headers_[FINE_TO_COARSE][index]) bufferSystemGPU_[FINE_TO_COARSE][index].sendBuffer(it.first).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers for (auto& iBlock : *forest) { @@ -532,7 +541,7 @@ void NonUniformGPUScheme< Stencil >::startCommunicationFineToCoarse(const uint_t { WALBERLA_ASSERT_NOT_NULLPTR(gpuDataBuffer.cur()) WALBERLA_ASSERT_GREATER_EQUAL(gpuDataBuffer.allocSize() - gpuDataBuffer.size(), pi->sizeFineToCoarseSend(fineBlock, *dir)) - pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, streams_[*dir]); + pi->communicateLocalFineToCoarse(fineBlock, coarseReceiverBlock, *dir, gpuDataBuffer, nullptr); } } else diff --git a/src/gpu/communication/UniformGPUScheme.h b/src/gpu/communication/UniformGPUScheme.h index 5c9604ccd8cc00e5cdb2d9f9c1085ace2f2e44a5..bc481d8950c25d4aa5196316c641e8b67e34318a 100644 --- a/src/gpu/communication/UniformGPUScheme.h +++ b/src/gpu/communication/UniformGPUScheme.h @@ -18,7 +18,6 @@ //! 
\author Martin Bauer <martin.bauer@fau.de> // //====================================================================================================================== - #pragma once #include "blockforest/StructuredBlockForest.h" @@ -32,9 +31,7 @@ #include <thread> -#include "gpu/GPURAII.h" #include "gpu/GPUWrapper.h" -#include "gpu/ParallelStreams.h" #include "gpu/communication/CustomMemoryBuffer.h" #include "gpu/communication/GeneratedGPUPackInfo.h" @@ -49,29 +46,34 @@ namespace communication { class UniformGPUScheme { public: - explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, - bool sendDirectlyFromGPU = false, - bool useLocalCommunication = true, + explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const bool sendDirectlyFromGPU = false, + const bool useLocalCommunication = true, const int tag = 5432 ); - explicit UniformGPUScheme( weak_ptr<StructuredBlockForest> bf, - const Set<SUID> & requiredBlockSelectors, - const Set<SUID> & incompatibleBlockSelectors, - bool sendDirectlyFromGPU = false, - bool useLocalCommunication = true, - const int tag = 5432 ); + explicit UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const Set<SUID> & requiredBlockSelectors, + const Set<SUID> & incompatibleBlockSelectors, + const bool sendDirectlyFromGPU = false, + const bool useLocalCommunication = true, + const int tag = 5432 ); + ~UniformGPUScheme() + { + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamDestroy(streams_[i])) + } void addPackInfo( const shared_ptr<GeneratedGPUPackInfo> &pi ); - void startCommunication( gpuStream_t stream = nullptr); - void wait( gpuStream_t stream = nullptr); + void startCommunication(); + void wait(); - void operator()( gpuStream_t stream = nullptr ) { communicate( stream ); } - inline void communicate( gpuStream_t stream = nullptr ) { startCommunication(stream); wait(stream); } + void operator()() { communicate( ); } + inline void communicate() { startCommunication(); wait(); } - std::function<void()> getCommunicateFunctor( gpuStream_t stream = nullptr ); - std::function<void()> getStartCommunicateFunctor( gpuStream_t stream = nullptr ); - std::function<void()> getWaitFunctor( gpuStream_t stream = nullptr ); + std::function<void()> getCommunicateFunctor(); + std::function<void()> getStartCommunicateFunctor(); + std::function<void()> getWaitFunctor(); private: void setupCommunication(); @@ -81,8 +83,8 @@ namespace communication { bool setupBeforeNextCommunication_; bool communicationInProgress_; - bool sendFromGPU_; - bool useLocalCommunication_; + const bool sendFromGPU_; + const bool useLocalCommunication_; using CpuBuffer_T = gpu::communication::PinnedMemoryBuffer; using GpuBuffer_T = gpu::communication::GPUMemoryBuffer; @@ -92,8 +94,6 @@ namespace communication { std::vector<shared_ptr<GeneratedGPUPackInfo> > packInfos_; - ParallelStreams parallelSectionManager_; - struct Header { BlockID blockId; @@ -103,6 +103,8 @@ namespace communication { Set<SUID> requiredBlockSelectors_; Set<SUID> incompatibleBlockSelectors_; + + gpuStream_t streams_[Stencil::Q]; }; diff --git a/src/gpu/communication/UniformGPUScheme.impl.h b/src/gpu/communication/UniformGPUScheme.impl.h index 93f6dd85e0e3f44293b9943e1bf252ce52c6ad33..84d9e0f22dd5661d1d428525d3758a5bb9a29488 100644 --- a/src/gpu/communication/UniformGPUScheme.impl.h +++ b/src/gpu/communication/UniformGPUScheme.impl.h @@ -19,10 +19,6 @@ // 
//====================================================================================================================== -#include "core/mpi/MPIWrapper.h" - -#include "gpu/ParallelStreams.h" - namespace walberla { namespace gpu { @@ -30,9 +26,9 @@ namespace communication { template<typename Stencil> - UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, - bool sendDirectlyFromGPU, - bool useLocalCommunication, + UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, + const bool sendDirectlyFromGPU, + const bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), @@ -41,7 +37,6 @@ namespace communication { useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), - parallelSectionManager_( -1 ), requiredBlockSelectors_( Set<SUID>::emptySet() ), incompatibleBlockSelectors_( Set<SUID>::emptySet() ) { @@ -52,14 +47,17 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) } template<typename Stencil> - UniformGPUScheme<Stencil>::UniformGPUScheme( weak_ptr <StructuredBlockForest> bf, + UniformGPUScheme<Stencil>::UniformGPUScheme( const weak_ptr< StructuredBlockForest >& bf, const Set<SUID> & requiredBlockSelectors, const Set<SUID> & incompatibleBlockSelectors, - bool sendDirectlyFromGPU, - bool useLocalCommunication, + const bool sendDirectlyFromGPU, + const bool useLocalCommunication, const int tag ) : blockForest_( bf ), setupBeforeNextCommunication_( true ), @@ -68,7 +66,6 @@ namespace communication { useLocalCommunication_(useLocalCommunication), bufferSystemCPU_( mpi::MPIManager::instance()->comm(), tag ), bufferSystemGPU_( mpi::MPIManager::instance()->comm(), tag ), - parallelSectionManager_( -1 ), requiredBlockSelectors_( requiredBlockSelectors ), incompatibleBlockSelectors_( incompatibleBlockSelectors ) { @@ -78,11 +75,14 @@ namespace communication { WALBERLA_CHECK(!sendDirectlyFromGPU) #endif } + + for (uint_t i = 0; i < Stencil::Q; ++i) + WALBERLA_GPU_CHECK(gpuStreamCreate(&streams_[i])) } template<typename Stencil> - void UniformGPUScheme<Stencil>::startCommunication( gpuStream_t stream ) + void UniformGPUScheme<Stencil>::startCommunication( ) { WALBERLA_ASSERT( !communicationInProgress_ ) auto forest = blockForest_.lock(); @@ -102,9 +102,11 @@ namespace communication { for( auto it : headers_ ) bufferSystemGPU_.sendBuffer( it.first ).clear(); + // wait until communication dependent kernels are finished + WALBERLA_GPU_CHECK(gpuDeviceSynchronize()) + // Start filling send buffers { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto &iBlock : *forest ) { auto senderBlock = dynamic_cast< Block * >( &iBlock ); @@ -127,7 +129,7 @@ namespace communication { auto receiverBlock = dynamic_cast< Block * >( forest->getBlock( senderBlock->getNeighborId( neighborIdx, uint_t(0) )) ); for (auto& pi : packInfos_) { - pi->communicateLocal(*dir, senderBlock, receiverBlock, stream); + pi->communicateLocal(*dir, senderBlock, receiverBlock, streams_[*dir]); } } else @@ -136,26 +138,27 @@ namespace communication { for( auto &pi : packInfos_ ) { - parallelSection.run([&](auto s) { auto size = pi->size( *dir, senderBlock ); auto gpuDataPtr = bufferSystemGPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( 
gpuDataPtr ) - pi->pack( *dir, gpuDataPtr, senderBlock, s ); + pi->pack( *dir, gpuDataPtr, senderBlock, streams_[*dir] ); if( !sendFromGPU_ ) { auto cpuDataPtr = bufferSystemCPU_.sendBuffer( nProcess ).advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) - WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, s )) + WALBERLA_GPU_CHECK( gpuMemcpyAsync( cpuDataPtr, gpuDataPtr, size, gpuMemcpyDeviceToHost, streams_[*dir] )) } - }); } } } } } // wait for packing to finish - WALBERLA_GPU_CHECK( gpuStreamSynchronize( stream ) ); + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } if( sendFromGPU_ ) bufferSystemGPU_.sendAll(); @@ -167,7 +170,7 @@ namespace communication { template<typename Stencil> - void UniformGPUScheme<Stencil>::wait( gpuStream_t stream ) + void UniformGPUScheme<Stencil>::wait() { WALBERLA_ASSERT( communicationInProgress_ ) @@ -175,7 +178,6 @@ namespace communication { if( sendFromGPU_ ) { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemGPU_.begin(); recvInfo != bufferSystemGPU_.end(); ++recvInfo ) { recvInfo.buffer().clear(); @@ -188,16 +190,13 @@ namespace communication { auto size = pi->size( header.dir, block ); auto gpuDataPtr = recvInfo.buffer().advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); - }); + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] ); } } } } else { - auto parallelSection = parallelSectionManager_.parallelSection( stream ); for( auto recvInfo = bufferSystemCPU_.begin(); recvInfo != bufferSystemCPU_.end(); ++recvInfo ) { auto &gpuBuffer = bufferSystemGPU_.sendBuffer( recvInfo.rank()); @@ -214,17 +213,17 @@ namespace communication { auto gpuDataPtr = gpuBuffer.advanceNoResize( size ); WALBERLA_ASSERT_NOT_NULLPTR( cpuDataPtr ) WALBERLA_ASSERT_NOT_NULLPTR( gpuDataPtr ) - parallelSection.run([&](auto s) { WALBERLA_GPU_CHECK( gpuMemcpyAsync( gpuDataPtr, cpuDataPtr, size, - gpuMemcpyHostToDevice, s )) - pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, s ); - }); + gpuMemcpyHostToDevice, streams_[header.dir] )) + pi->unpack( stencil::inverseDir[header.dir], gpuDataPtr, block, streams_[header.dir] ); } } } } - - WALBERLA_GPU_CHECK( gpuDeviceSynchronize() ) + for (uint_t i = 0; i < Stencil::Q; ++i) + { + WALBERLA_GPU_CHECK(gpuStreamSynchronize(streams_[i])) + } communicationInProgress_ = false; } @@ -312,21 +311,21 @@ namespace communication { } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getCommunicateFunctor() { - return [this, stream]() { communicate( stream ); }; + return [this]() { communicate( ); }; } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getStartCommunicateFunctor() { - return [this, stream]() { startCommunication( stream ); }; + return [this]() { startCommunication(); }; } template< typename Stencil > - std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor(gpuStream_t stream) + std::function<void()> UniformGPUScheme<Stencil>::getWaitFunctor() { - return [this, stream]() { wait( stream ); }; + return [this]() { wait(); }; } } // namespace 
communication diff --git a/src/lbm_generated/evaluation/PerformanceEvaluation.h b/src/lbm_generated/evaluation/PerformanceEvaluation.h index 9fb7e934a2506ca360af12882a0775bcf8281eb6..36f112ac3360f1125fcc11733c8ab56955daceb0 100644 --- a/src/lbm_generated/evaluation/PerformanceEvaluation.h +++ b/src/lbm_generated/evaluation/PerformanceEvaluation.h @@ -24,6 +24,7 @@ #include "core/DataTypes.h" #include "core/Hostname.h" #include "core/Set.h" +#include "core/OpenMP.h" #include "core/waLBerlaBuildInfo.h" #include "core/debug/CheckFunctions.h" #include "core/logging/Logging.h" @@ -223,10 +224,17 @@ PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::PerformanceEvalu fluidCells_( fluidCellCounter ) { #ifdef _OPENMP - if( std::getenv( "OMP_NUM_THREADS" ) == NULL ) - WALBERLA_ABORT( "If you are using a version of the program that was compiled with OpenMP you have to " - "specify the environment variable \'OMP_NUM_THREADS\' accordingly!" ); - threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) ); + if( std::getenv( "OMP_NUM_THREADS" ) ) + { + threadsPerProcess_ = std::atoi( std::getenv( "OMP_NUM_THREADS" ) ); + } + else + { + WALBERLA_LOG_WARNING( "You are using a version of the program that was compiled with OpenMP and the environment " + "variable \'OMP_NUM_THREADS\' was not set!\nThe number of threads will now be determined with " + "\'omp_get_max_threads\'.\nIf this is not correct in your case, please set \'OMP_NUM_THREADS\'" ); + threadsPerProcess_ = omp_get_max_threads(); + } #endif if( std::getenv( "THREADS_PER_CORE" ) ) @@ -256,7 +264,7 @@ std::string PerformanceEvaluationBase< CellCounter_T, FluidCellCounter_T >::logg { std::ostringstream oss; - std::string na( "n/a *)" ); + const std::string na( "n/a *)" ); std::ostringstream threadsPerCoreString; threadsPerCoreString << threadsPerCore_; diff --git a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h index 5bb43c3c874e160253dd096e87ee8c50e2aa08b3..6665cf3513df16ca09e7c0cfc7c66c35639b1869 100644 --- a/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h +++ b/src/lbm_generated/gpu/BasicRecursiveTimeStepGPU.impl.h @@ -243,7 +243,7 @@ void BasicRecursiveTimeStepGPU< PdfField_T, SweepCollection_T, BoundaryCollectio for(auto it = CommunicationStencil::beginNoCenter(); it != CommunicationStencil::end(); ++it){ uint_t nSecIdx = blockforest::getBlockNeighborhoodSectionIndex(*it); // Propagate on ghost layers shadowing coarse or no blocks - if(!block->neighborhoodSectionHasSmallerBlocks(nSecIdx)){ + if(block->neighborhoodSectionHasLargerBlock(nSecIdx)){ CellInterval ci; pdfField->getGhostRegion(*it, ci, 1); sweepCollection_.streamOnlyNoAdvancementCellInterval(block, ci, gpuStream); diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt index 46b98eb48c90884feda14e4dde15341b4b5f3687..8d3f0298ac3bf387dfb19d18b46bfaff8caa6912 100644 --- a/tests/core/CMakeLists.txt +++ b/tests/core/CMakeLists.txt @@ -222,3 +222,17 @@ if( WALBERLA_BUILD_WITH_PARMETIS ) waLBerla_compile_test( NAME PlainParMetisTest FILES load_balancing/PlainParMetisTest.cpp ) waLBerla_execute_test( NAME PlainParMetisTest PROCESSES 3 ) endif() + +################### +# Mixed Precision # +################### + +if ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT ) + waLBerla_compile_test( NAME Float16SupportTest FILES Float16SupportTest.cpp DEPENDS core) + # Actual support for float16 is available only since C++23 + # before that, is_arithmetic and is_floating_point evaluated to false, + # also many STL 
functions are compatible with float16 only since C++23. + # Which features are actually supported depends on the compiler + target_compile_features( Float16SupportTest PUBLIC cxx_std_23 ) + waLBerla_execute_test(NAME Float16SupportTest) +endif () \ No newline at end of file diff --git a/tests/core/Float16SupportTest.cpp b/tests/core/Float16SupportTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..04ea9378f54eee4ee78f81177fc609d732da21c5 --- /dev/null +++ b/tests/core/Float16SupportTest.cpp @@ -0,0 +1,163 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file Float16SupportTest.cpp +//! \ingroup core +//! \author Michael Zikeli <michael.zikeli@fau.de> +// +//====================================================================================================================== + +#include <memory> +#include <numeric> + +#include "core/DataTypes.h" +#include "core/Environment.h" +#include "core/logging/Logging.h" + +namespace walberla::simple_Float16_test { +using walberla::floatIsEqual; +using walberla::real_t; +using walberla::uint_c; +using walberla::uint_t; + +// === Choosing Accuracy === +//+++ Precision : fp16 +++ +using walberla::float16; +using walberla::float32; +using walberla::float64; +using dst_t = float16; +using src_t = real_t; +constexpr real_t precisionLimit = walberla::float16( 1e-3 ); +const std::string precisionType = "float16"; +constexpr const auto maxLevel = uint_t( 3 ); + +void simple_array_test() +{ + auto fpSrc = std::make_shared< src_t[] >( 10 ); + auto fpDst = std::make_shared< dst_t[] >( 10 ); + + std::fill_n( fpSrc.get(), 10, 17. ); + std::fill_n( fpDst.get(), 10, (dst_t) 17. ); + + fpSrc[5] = 8.; + fpDst[5] = (dst_t) 8.; + + // Test equality with custom compare + WALBERLA_CHECK_LESS( std::fabs( fpSrc[9] - (src_t) fpDst[9] ), precisionLimit ); + WALBERLA_CHECK_LESS( std::fabs( fpSrc[5] - (src_t) fpDst[5] ), precisionLimit ); + // Test specialized floatIsEqual + WALBERLA_CHECK( floatIsEqual( fpSrc[9], (src_t) fpDst[9], (src_t) precisionLimit ) ); + WALBERLA_CHECK( floatIsEqual( (dst_t) fpSrc[9], fpDst[9], (dst_t) precisionLimit ) ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst[9] ); + + // Test std::fill_n + auto other_fpDst = std::make_shared< dst_t[] >( 10 ); + std::fill_n( other_fpDst.get(), 10, (dst_t) 2. 
); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., other_fpDst[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., other_fpDst[5] ); + + // Test std::swap + std::swap( fpDst, other_fpDst ); + fpDst[5] = (dst_t) 9.; + + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], other_fpDst[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], other_fpDst[5] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 2., fpDst[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) 9., fpDst[5] ); + +} // simple_Float16_test::simple_array_test() + +void vector_test() +{ + auto fpSrc = std::vector< src_t >( 10 ); + auto fpDst_cast = std::vector< dst_t >( 10 ); + auto fp32 = std::vector< walberla::float32 >( 10 ); + auto fpDst = std::vector< dst_t >( 10 ); + + fpSrc.assign( 10, 1.5 ); + fpDst_cast.assign( 10, (dst_t) 1.5 ); + fp32.assign( 10, 1.5f ); + std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() ); + WALBERLA_LOG_WARNING_ON_ROOT( + " std::vector::assign is not able to assign " + << typeid( src_t ).name() << " values to container of type " << precisionType << ".\n" + << " Therefore, the floating point value for assign must be cast beforehand or std::copy must be used, since it uses a static_cast internally." ); + + fpSrc[5] = 2.3; + fpDst_cast[5] = (dst_t) 2.3; + fp32[5] = 2.3f; + fpDst[5] = (dst_t) 2.3; + + WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[0], fp32[0] ); + WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[9], fp32[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (walberla::float32) fpSrc[5], fp32[5] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[0], fpDst_cast[0] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst_cast[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], fpDst_cast[5] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[0], fpDst[0] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[9], fpDst[9] ); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t) fpSrc[5], fpDst[5] ); + WALBERLA_CHECK_EQUAL( typeid( fpDst ), typeid( fpDst_cast ) ); + + // Add up all elements of the vector to check whether the result is sufficiently correct. + { + const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end()); + const auto sumDst = std::reduce(fpDst.begin(), fpDst.end()); + WALBERLA_CHECK_FLOAT_EQUAL( (dst_t)sumSrc, sumDst ); + } + { + fpSrc.assign( 13, 1.3 ); + fpDst.resize( fpSrc.size() ); std::copy( fpSrc.begin(), fpSrc.end(), fpDst.begin() ); // resize first so copying 13 elements stays in bounds + const auto sumSrc = std::reduce(fpSrc.begin(), fpSrc.end()); + const auto sumDst = std::reduce(fpDst.begin(), fpDst.end()); + WALBERLA_CHECK_FLOAT_UNEQUAL( (dst_t)sumSrc, sumDst ); + } +} // simple_Float16_test::vector_test() + +int main( int argc, char** argv ) +{ + // This check only works since C++23 and is used in many implementations, so it is important that it works. + WALBERLA_CHECK( std::is_arithmetic< dst_t >::value ); + + walberla::Environment env( argc, argv ); + walberla::logging::Logging::instance()->setLogLevel( walberla::logging::Logging::INFO ); + walberla::MPIManager::instance()->useWorldComm(); + + WALBERLA_LOG_INFO_ON_ROOT( " This run is executed with " << precisionType ); + WALBERLA_LOG_INFO_ON_ROOT( " machine precision limit is " << precisionLimit ); + const std::string stringLine( 125, '=' ); + WALBERLA_LOG_INFO_ON_ROOT( stringLine ); + + WALBERLA_LOG_INFO_ON_ROOT( " Start a test with shared_ptr<float16[]>." ); + simple_array_test(); + + WALBERLA_LOG_INFO_ON_ROOT( " Start a test with std::vector<float16>." ); + vector_test(); + + WALBERLA_LOG_INFO_ON_ROOT( " Start a test where float32 is sufficient but float16 is not." 
); + WALBERLA_CHECK_FLOAT_UNEQUAL( dst_t(1.0)-dst_t(0.3), 1.0-0.3 ); + WALBERLA_CHECK_FLOAT_EQUAL( 1.0f-0.3f, 1.0-0.3 ); + + return 0; +} // simple_Float16_test::main() + +} // namespace walberla::simple_Float16_test + +int main( int argc, char** argv ) +{ + walberla::simple_Float16_test::main( argc, argv ); + + return EXIT_SUCCESS; +} // main() diff --git a/tests/gpu/communication/CommTest.cpp b/tests/gpu/communication/CommTest.cpp index 5bc87aa13f9a2c72351b532fb20e3614cdfc8d82..a8a60321d9ee45f131843dbcb9f974f41ace0b01 100644 --- a/tests/gpu/communication/CommTest.cpp +++ b/tests/gpu/communication/CommTest.cpp @@ -47,7 +47,7 @@ void hostToHost() hostField2.set(hostField1); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void hostToDevice() @@ -61,7 +61,7 @@ void hostToDevice() gpu::fieldCpy(deviceField, hostField); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void deviceToHost() @@ -76,7 +76,7 @@ void deviceToHost() gpu::fieldCpy(hostField, deviceField); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiHostToHost() @@ -100,7 +100,7 @@ void mpiHostToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiHostToDevice() @@ -124,7 +124,7 @@ void mpiHostToDevice() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiDeviceToHost() @@ -148,7 +148,7 @@ void mpiDeviceToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiDeviceToDevice() @@ -172,7 +172,7 @@ void mpiDeviceToDevice() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiCopyHostToDevice() @@ -199,7 +199,7 @@ void mpiCopyHostToDevice() gpu::fieldCpy(deviceField, hostField2); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } void mpiCopyDeviceToHost() @@ -226,7 +226,7 @@ void mpiCopyDeviceToHost() MPI_Wait(&request2, MPI_STATUS_IGNORE); } double const endTime = MPI_Wtime(); - std::cout << __FUNCTION__ << ": " << endTime - startTime << std::endl; + std::cout << __FUNCTION__ << ": " << endTime - startTime << '\n'; } int main(int argc, char** argv)