diff --git a/CMakeLists.txt b/CMakeLists.txt index 557a81e0f32c0018f7b06ff8cc4050381316759f..fabbc83e56a72e49c311b6672131d4769ea5fcbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,8 @@ option ( WALBERLA_LOG_SKIPPED "Log skipped cmake targets" option ( WALBERLA_GIT_SUBMODULE_AUTO "Check submodules during cmake run" ON ) +option ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT "Experimental half precision support" OFF ) + # Installation Directory set ( CMAKE_INSTALL_PREFIX /usr/local/waLBerla CACHE STRING "The default installation directory." ) @@ -476,18 +478,5 @@ endif() -# disable Xcode 7.3+ linker deduplication pass to speed up linking in debug mode -if ( APPLE ) - execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION ) - string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} ) - string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} ) - list( GET LINKER_VERSION 0 LINKER_TYPE ) - list( GET LINKER_VERSION 1 LINKER_VERSION ) - if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 ) - add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate") - endif() -endif()
############################################################################################################################ @@ -1271,6 +1260,34 @@ if ( WALBERLA_SANITIZE_UNDEFINED ) endif() endif() +############################################################################################################################ +## +## Half precision +## +############################################################################################################################ +if (WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) + if (WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG) + message(STATUS "Configuring with *experimental* half precision (float16) support. You better know what you are doing.") + if (WALBERLA_CXX_COMPILER_IS_GNU AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support for gcc has only been tested with version >= 12. " + "You are using a previous version - it may not work correctly.") + endif () + if (WALBERLA_CXX_COMPILER_IS_CLANG AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support for clang has only been tested with version >= 15. " + "You are using a previous version - it may not work correctly.") + endif () + if (NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST) + message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! 
Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!") + endif () + else () + message(FATAL_ERROR "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] " + "Half precision support is currently only available for gcc and clang.") + endif () +endif () + ############################################################################################################################ # Documentation Generation # diff --git a/apps/tools/CMakeLists.txt b/apps/tools/CMakeLists.txt index 3d4e98e5c818c1baebcc4bd1399c44ad9b10779e..eaf667f372496c46e6617c16593844ee58a5eba1 100644 --- a/apps/tools/CMakeLists.txt +++ b/apps/tools/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory( MixedPrecision ) add_subdirectory( povrayFileCompressor ) \ No newline at end of file diff --git a/apps/tools/MixedPrecision/CMakeLists.txt b/apps/tools/MixedPrecision/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f14c7e5a2b3ea377db65597d4bfe37b3f38055d --- /dev/null +++ b/apps/tools/MixedPrecision/CMakeLists.txt @@ -0,0 +1,3 @@ +waLBerla_add_executable ( NAME CheckFP16 + FILES CheckFP16.cpp + DEPENDS core ) \ No newline at end of file diff --git a/apps/tools/MixedPrecision/CheckFP16.cpp b/apps/tools/MixedPrecision/CheckFP16.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c436a81f2e6a4e1286ec8424c116270e169ab661 --- /dev/null +++ b/apps/tools/MixedPrecision/CheckFP16.cpp @@ -0,0 +1,197 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. 
+// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! \file CheckFP16.cpp +//! \brief Checks the availability of float16 (half precision) and verifies some properties. +//! \author Nils Kohl <nils.kohl@fau.de> +// +//====================================================================================================================== + +#include <core/DataTypes.h> +#include <core/Environment.h> +#include <core/logging/Logging.h> +#include <core/perf_analysis/extern/likwid.h> + +namespace walberla +{ + +template< typename T > +void kernel(T* v, T* vv, T* r, size_t vsize) +{ + for (size_t i = 0; i < vsize; i++) + { + r[i] = v[i] + vv[i]; + } +} + +int main(int argc, char** argv) +{ + Environment const env(argc, argv); + + WALBERLA_LOG_INFO_ON_ROOT("-------------") + WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ") + WALBERLA_LOG_INFO_ON_ROOT("-------------") + +#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.") + WALBERLA_LOG_INFO_ON_ROOT(" Reconfigure by setting the respective CMake variable to ON.") + WALBERLA_LOG_INFO_ON_ROOT(" At the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT.") + + return EXIT_FAILURE; +#else + WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ") + const auto sfloat64 = sizeof(float64); + const auto sfloat32 = sizeof(float32); + const auto sfloat16 = sizeof(float16); + WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float64 ) == " << sfloat64) + WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float32 ) == " << sfloat32) 
+ WALBERLA_LOG_INFO_ON_ROOT(" + sizeof( float16 ) == " << sfloat16) + if (sfloat64 != 8 || sfloat32 != 4 || sfloat16 != 2) + { + WALBERLA_LOG_INFO_ON_ROOT(" Your types don't seem to have the expected sizes.") + return EXIT_FAILURE; + } + WALBERLA_LOG_INFO_ON_ROOT(" -> works out!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ") + const float64 a64 = 42; + const float32 a32 = 42; + const float16 a16 = 42; + WALBERLA_LOG_INFO_ON_ROOT(" + float64: " << a64) + WALBERLA_LOG_INFO_ON_ROOT(" + float32: " << a32) + WALBERLA_LOG_INFO_ON_ROOT(" + float16: " << (double) a16) + WALBERLA_LOG_INFO_ON_ROOT(" Casting and output compiles.") + + WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ") + const auto x = float16(1.2); + const auto y = float16(-1.8); + const float64 z = -0.6; + const float16 sum = x + y; + WALBERLA_LOG_INFO_ON_ROOT(" " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << "") + WALBERLA_CHECK(std::abs((float64) sum - z) < 1e-3, "Float16 arithmetic is broken."); + WALBERLA_LOG_INFO_ON_ROOT("") + +# ifdef WALBERLA_BUILD_WITH_LIKWID_MARKERS + WALBERLA_LOG_INFO_ON_ROOT(" - Memory traffic test. You have built with likwid enabled. Make sure to run ") + WALBERLA_LOG_INFO_ON_ROOT(" $ likwid-perfctr -g MEM_DP -m ./CheckFP16") + WALBERLA_LOG_INFO_ON_ROOT(" to compare the memory traffic, and") + WALBERLA_LOG_INFO_ON_ROOT(" $ likwid-perfctr -g FLOPS_AVX -m ./CheckFP16") + WALBERLA_LOG_INFO_ON_ROOT( + " for the stream-triad-like benchmark to check whether automatic float32 vectorization works.") + WALBERLA_LOG_INFO_ON_ROOT("") + WALBERLA_LOG_INFO_ON_ROOT(" The only real benefit of using float16 is reduced memory traffic since internally,\n" + "all arithmetic operations are preceded by promotions to float32 (likely - depends on " + "the machine).") + WALBERLA_LOG_INFO_ON_ROOT(" + Stream test ... 
") + + LIKWID_MARKER_INIT; + LIKWID_MARKER_THREADINIT; + + LIKWID_MARKER_REGISTER("float64-mem"); + LIKWID_MARKER_REGISTER("float32-mem"); + LIKWID_MARKER_REGISTER("float16-mem"); + + LIKWID_MARKER_REGISTER("float64-vec"); + LIKWID_MARKER_REGISTER("float32-vec"); + LIKWID_MARKER_REGISTER("float16-vec"); + + size_t vsize = 100000000; + + std::vector< float64 > v64(vsize, 0.01); + std::vector< float32 > v32(vsize, 0.01f); + std::vector< float16 > v16(vsize, float16(0.01)); + + std::vector< float64 > vv64(vsize, 0.02); + std::vector< float32 > vv32(vsize, 0.02f); + std::vector< float16 > vv16(vsize, float16(0.02)); + + std::vector< float64 > r64(vsize); + std::vector< float32 > r32(vsize); + std::vector< float16 > r16(vsize); + + LIKWID_MARKER_START("float64-mem"); + float64 sum64 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum64 += v64[j]; } + else { sum64 -= v64[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float64 vector entries. Should be zero up to rounding errors: " << sum64); + LIKWID_MARKER_STOP("float64-mem"); + + // Start measurements + LIKWID_MARKER_START("float32-mem"); + float32 sum32 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum32 += v32[j]; } + else { sum32 -= v32[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float32 vector entries. Should be zero up to rounding errors: " << sum32); + LIKWID_MARKER_STOP("float32-mem"); + + // Start measurements + LIKWID_MARKER_START("float16-mem"); + float16 sum16 = 0; + for (size_t j = 0; j < vsize; j++) + { + if (0 == j % 2) { sum16 += v16[j]; } + else { sum16 -= v16[j]; } + } + WALBERLA_LOG_INFO_ON_ROOT( + " + Printing sum of float16 vector entries. Should be zero up to rounding errors: " << (double) sum16); + LIKWID_MARKER_STOP("float16-mem"); + + WALBERLA_LOG_INFO_ON_ROOT(" + Vectorization test ... 
") + + float64* v64_ptr = v64.data(); + float64* vv64_ptr = vv64.data(); + float64* r64_ptr = r64.data(); + LIKWID_MARKER_START("float64-vec"); + kernel(v64_ptr, vv64_ptr, r64_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float64 vector sum: " << r64[vsize / 2]); + LIKWID_MARKER_STOP("float64-vec"); + + float32* v32_ptr = v32.data(); + float32* vv32_ptr = vv32.data(); + float32* r32_ptr = r32.data(); + LIKWID_MARKER_START("float32-vec"); + kernel(v32_ptr, vv32_ptr, r32_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float32 vector sum: " << r32[vsize / 2]); + LIKWID_MARKER_STOP("float32-vec"); + + float16* v16_ptr = v16.data(); + float16* vv16_ptr = vv16.data(); + float16* r16_ptr = r16.data(); + LIKWID_MARKER_START("float16-vec"); + kernel(v16_ptr, vv16_ptr, r16_ptr, vsize); + WALBERLA_LOG_INFO_ON_ROOT(" + Printing entry of float16 vector sum: " << (double) r16[vsize / 2]); + LIKWID_MARKER_STOP("float16-vec"); + + LIKWID_MARKER_CLOSE; + +# else + WALBERLA_LOG_INFO_ON_ROOT(" - Build and run with likwid to run memory traffic test.") +# endif +#endif + return EXIT_SUCCESS; +} +} // namespace walberla + +int main(int argc, char** argv) { return walberla::main(argc, argv); } diff --git a/src/core/DataTypes.h b/src/core/DataTypes.h index 2f868719787ae5d6930cb55581bfc3df1298d4e5..bae5b7651eaa17bc67c9fe822eeb386de38f61ca 100644 --- a/src/core/DataTypes.h +++ b/src/core/DataTypes.h @@ -167,6 +167,33 @@ using real_t = double; using real_t = float; #endif +/// Half precision support. Experimental. Use carefully. +/// +/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support. +/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and +/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16. +/// Only bandwidth bound code may therefore benefit. 
None of this is guaranteed, and may change in the future. +/// +#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT +# if defined(WALBERLA_CXX_COMPILER_IS_CLANG) || defined(WALBERLA_CXX_COMPILER_IS_GNU) +/// Clang version must be 15 or higher for x86 half precision support. +/// GCC version must be 12 or higher for x86 half precision support. +/// Also support seems to require SSE, so ensure that respective instruction sets are enabled. +/// See +/// https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point +/// https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html +/// for more information. +using half = _Float16; +using float16 = half; +# else +static_assert(false, "\n\n### Attempting to build walberla with half precision support.\n" + "### However, the compiler you chose is not suited for that, or we simply have not implemented " + "support for half precision and your compiler.\n"); +# endif +#endif +using float32 = float; +using float64 = double; + inline constexpr real_t operator"" _r( long double t ) { return static_cast< real_t >(t); } inline constexpr real_t operator"" _r( unsigned long long int t ) { return static_cast< real_t >(t); } template< typename T > inline real_t real_c ( T t ) { return numeric_cast< real_t >(t); } ///< cast to type real_t using "real_c(x)" diff --git a/src/waLBerlaDefinitions.in.h b/src/waLBerlaDefinitions.in.h index ea9dfee9179c8dbd8ec92e943216f25a77842180..3676dcd942011ea90850f8667f50eae94af0d468 100644 --- a/src/waLBerlaDefinitions.in.h +++ b/src/waLBerlaDefinitions.in.h @@ -13,6 +13,8 @@ // double or single precision #cmakedefine WALBERLA_DOUBLE_ACCURACY +// Experimental half precision support. 
+#cmakedefine WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT // Debugging options #cmakedefine WALBERLA_ENABLE_GUI @@ -25,6 +27,7 @@ #cmakedefine WALBERLA_BUILD_WITH_OPENMP #cmakedefine WALBERLA_BUILD_WITH_METIS #cmakedefine WALBERLA_BUILD_WITH_PARMETIS +#cmakedefine WALBERLA_BUILD_WITH_LIKWID_MARKERS #cmakedefine WALBERLA_BUILD_WITH_PYTHON diff --git a/tests/core/CMakeLists.txt b/tests/core/CMakeLists.txt index 70e14368e1d854a4ed7189cfc394ad7838361a4f..788e8f3ba206dea078a20dd9a3525a92d914be66 100644 --- a/tests/core/CMakeLists.txt +++ b/tests/core/CMakeLists.txt @@ -193,6 +193,9 @@ waLBerla_compile_test( FILES DebugSTLTest.cpp ) waLBerla_execute_test( NAME DebugSTLTest ) set_tests_properties(DebugSTLTest PROPERTIES WILL_FAIL TRUE) +waLBerla_compile_test( FILES FP16Test.cpp ) +waLBerla_execute_test( NAME FP16Test ) + waLBerla_compile_test( FILES FunctionTraitsTest.cpp ) waLBerla_execute_test( NAME FunctionTraitsTest ) diff --git a/tests/core/FP16Test.cpp b/tests/core/FP16Test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..60a2be0eeee0872449f6a648fa1c65abbbda7f42 --- /dev/null +++ b/tests/core/FP16Test.cpp @@ -0,0 +1,85 @@ +//====================================================================================================================== +// +// This file is part of waLBerla. waLBerla is free software: you can +// redistribute it and/or modify it under the terms of the GNU General Public +// License as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// waLBerla is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. +// +// You should have received a copy of the GNU General Public License along +// with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>. +// +//! 
\file FP16Test.cpp +//! \ingroup core +//! \author Nils Kohl <nils.kohl@fau.de> +// +//====================================================================================================================== + +#include "core/DataTypes.h" +#include "core/debug/Debug.h" +#include "core/debug/TestSubsystem.h" +#include "core/logging/Logging.h" +#include "core/Environment.h" + +#include <cstdlib> +#include <iostream> + +namespace walberla { + +void fp16Test( int argc, char ** argv ) +{ + Environment const env( argc, argv ); + + WALBERLA_LOG_INFO_ON_ROOT("-------------") + WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ") + WALBERLA_LOG_INFO_ON_ROOT("-------------") + +#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT + WALBERLA_LOG_INFO_ON_ROOT(" - Test does nothing as it was not built with fp16 support.") + WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.") + WALBERLA_LOG_INFO_ON_ROOT(" - Reconfigure by setting the respective CMake variable " + "(at the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) " + "to ON.") +#else + + WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!") + + WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ") + auto sfloat64 = sizeof(float64); + auto sfloat32 = sizeof(float32); + auto sfloat16 = sizeof(float16); + WALBERLA_CHECK_EQUAL( sfloat64, 8, "Your types don't seem to have the expected sizes." ); + WALBERLA_CHECK_EQUAL( sfloat32, 4, "Your types don't seem to have the expected sizes." ); + WALBERLA_CHECK_EQUAL( sfloat16, 2, "Your types don't seem to have the expected sizes." 
); + + WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ") + const float64 a64 = 42; + const float32 a32 = 42; + const float16 a16 = 42; + WALBERLA_LOG_INFO_ON_ROOT(" + float64: " << a64) + WALBERLA_LOG_INFO_ON_ROOT(" + float32: " << a32) + WALBERLA_LOG_INFO_ON_ROOT(" + float16: " << (double) a16) + WALBERLA_LOG_INFO_ON_ROOT(" Casting and output compiles.") + + WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ") + const float16 x = 1.2f16; + const float16 y = -1.8f16; + const float64 z = -0.6; + WALBERLA_LOG_INFO_ON_ROOT(" + " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << " ? ") + WALBERLA_CHECK_FLOAT_EQUAL_EPSILON((float64) (x + y), z, 1e-3, "float16 addition does not work correctly."); /* binary16 keeps 10 explicit mantissa bits: 1.2 and -1.8 are rounded when stored, so the sum is -0.599609375 and misses -0.6 by about 4e-4; compare with an fp16-sized tolerance instead of the default double-precision one */ +#endif +} + +} + + +int main( int argc, char** argv ) +{ + walberla::debug::enterTestMode(); + walberla::fp16Test( argc, argv ); + return EXIT_SUCCESS; +}