Experimental x86 half precision support.

c5f6979f · Nils Kohl · Markus Holzer · 55e06038 · c5f6979f · c5f6979f
Commit c5f6979f authored 1 year ago by Nils Kohl Committed by Markus Holzer 1 year ago
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -102,6 +102,8 @@ option ( WALBERLA_LOG_SKIPPED               "Log skipped cmake targets"

 option ( WALBERLA_GIT_SUBMODULE_AUTO        "Check submodules during cmake run"               ON )

+option ( WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT "Experimental half precision support"    OFF )
+
 # Installation Directory
 set ( CMAKE_INSTALL_PREFIX /usr/local/waLBerla CACHE STRING "The default installation directory."   )

@@ -476,18 +478,18 @@ endif()


 # disable Xcode 7.3+ linker deduplication pass to speed up linking in debug mode
-if ( APPLE )
-   execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION )
-   string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} )
-   string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} )
-   list( GET LINKER_VERSION 0 LINKER_TYPE )
-   list( GET LINKER_VERSION 1 LINKER_VERSION )
-   if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 )
-       add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG    "-Wl,-no_deduplicate")
-       add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
-       add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
-   endif()
-endif()
+# NOTE(review): the ld64 "-no_deduplicate" workaround below is disabled by commenting it out; the reason is not
+# recorded here. TODO confirm whether it should be restored or deleted instead of being kept as dead code.
+#if ( APPLE )
+#   execute_process( COMMAND ${CMAKE_LINKER} -v OUTPUT_VARIABLE LINKER_VERSION ERROR_VARIABLE LINKER_VERSION )
+#   string( REGEX MATCH "ld64-[0-9\\.\\-]+" LINKER_VERSION ${LINKER_VERSION} )
+#   string( REGEX MATCHALL "[^\\-]+" LINKER_VERSION ${LINKER_VERSION} )
+#   list( GET LINKER_VERSION 0 LINKER_TYPE )
+#   list( GET LINKER_VERSION 1 LINKER_VERSION )
+#   if( LINKER_TYPE STREQUAL "ld64" AND LINKER_VERSION VERSION_GREATER 264.3.101 )
+#       add_flag( CMAKE_EXE_LINKER_FLAGS_DEBUG    "-Wl,-no_deduplicate")
+#       add_flag( CMAKE_MODULE_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
+#       add_flag( CMAKE_SHARED_LINKER_FLAGS_DEBUG "-Wl,-no_deduplicate")
+#   endif()
+#endif()


 ############################################################################################################################
@@ -1271,6 +1273,34 @@ if ( WALBERLA_SANITIZE_UNDEFINED )
    endif()
 endif()

+############################################################################################################################
+##
+##  Half precision
+##
+############################################################################################################################
+if (WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT)
+    # Reject unsupported compilers up front; FATAL_ERROR stops processing, so nothing below runs for them.
+    if (NOT (WALBERLA_CXX_COMPILER_IS_GNU OR WALBERLA_CXX_COMPILER_IS_CLANG))
+        message(FATAL_ERROR "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
+                "Half precision support is currently only available for gcc and clang.")
+    endif ()
+
+    message(STATUS "Configuring with *experimental* half precision (float16) support. You better know what you are doing.")
+
+    # Compiler versions older than the tested ones are accepted, but the user is warned.
+    if (WALBERLA_CXX_COMPILER_IS_GNU)
+        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0.0)
+            message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
+                    "Half precision support for gcc has only been tested with version >= 12. "
+                    "You are using a previous version - it may not work correctly.")
+        endif ()
+    endif ()
+    if (WALBERLA_CXX_COMPILER_IS_CLANG)
+        if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0)
+            message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
+                    "Half precision support for clang has only been tested with version >= 15. "
+                    "You are using a previous version - it may not work correctly.")
+        endif ()
+    endif ()
+
+    # Warn when the build is not tuned for the local machine (see the message text for the consequences).
+    if (NOT WALBERLA_OPTIMIZE_FOR_LOCALHOST)
+        message(WARNING "[WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT] "
+                "You are not optimizing for localhost. You may encounter linker errors, or WORSE: silent incorrect fp16 arithmetic! Consider also enabling WALBERLA_OPTIMIZE_FOR_LOCALHOST!")
+    endif ()
+endif ()
+
 ############################################################################################################################
 # Documentation Generation
 #

--- a/apps/tools/CMakeLists.txt
+++ b/apps/tools/CMakeLists.txt
+add_subdirectory( MixedPrecision )
 add_subdirectory( povrayFileCompressor )
\ No newline at end of file
--- a/apps/tools/MixedPrecision/CMakeLists.txt
+++ b/apps/tools/MixedPrecision/CMakeLists.txt
+# CheckFP16: small tool that reports whether half precision (float16) support is usable in this build.
+waLBerla_add_executable ( NAME CheckFP16
+        FILES CheckFP16.cpp
+        DEPENDS core )
--- a/apps/tools/MixedPrecision/CheckFP16.cpp
+++ b/apps/tools/MixedPrecision/CheckFP16.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file CheckFP16.cpp
+//! \brief Checks the availability of float16 (half precision) and verifies some properties.
+//! \author Nils Kohl <nils.kohl@fau.de>
+//
+//======================================================================================================================
+
+#include <core/DataTypes.h>
+#include <core/Environment.h>
+#include <core/logging/Logging.h>
+#include <core/perf_analysis/extern/likwid.h>
+
+namespace walberla
+{
+
+//! Element-wise addition: r[i] = v[i] + vv[i] for every i in [0, vsize).
+//!
+//! \param v      first summand array, must hold at least vsize elements (only read)
+//! \param vv     second summand array, must hold at least vsize elements (only read)
+//! \param r      output array, must hold at least vsize elements (overwritten)
+//! \param vsize  number of elements to process; 0 leaves r untouched
+template< typename T >
+void kernel(const T* v, const T* vv, T* r, size_t vsize)
+{
+   // The inputs are const-qualified so read-only data can be passed; T is still deduced from plain T* arguments.
+   for (size_t i = 0; i < vsize; ++i)
+   {
+      r[i] = v[i] + vv[i];
+   }
+}
+
+//! Body of the CheckFP16 tool.
+//!
+//! Logs whether half precision support was compiled in and, if so, checks the sizes of float64/float32/float16,
+//! prints values of each type (float16 is cast to double for output), and verifies one float16 addition.
+//! If the build additionally defines WALBERLA_BUILD_WITH_LIKWID_MARKERS, it runs likwid-marker-instrumented
+//! reduction and element-wise addition loops for all three types.
+//!
+//! \return EXIT_FAILURE if half precision support is not enabled or a type has an unexpected size,
+//!         EXIT_SUCCESS otherwise.
+int main(int argc, char** argv)
+{
+   Environment const env(argc, argv);
+
+   WALBERLA_LOG_INFO_ON_ROOT("-------------")
+   WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ")
+   WALBERLA_LOG_INFO_ON_ROOT("-------------")
+
+#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   // Built without half precision support: explain how to enable it and report failure.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.")
+   WALBERLA_LOG_INFO_ON_ROOT("   Reconfigure by setting the respective CMake variable to ON.")
+   WALBERLA_LOG_INFO_ON_ROOT("   At the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT.")
+
+   return EXIT_FAILURE;
+#else
+   WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!")
+
+   // 1) The three floating point aliases must occupy 8, 4 and 2 bytes respectively.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ")
+   const auto sfloat64 = sizeof(float64);
+   const auto sfloat32 = sizeof(float32);
+   const auto sfloat16 = sizeof(float16);
+   WALBERLA_LOG_INFO_ON_ROOT("   + sizeof( float64 ) == " << sfloat64)
+   WALBERLA_LOG_INFO_ON_ROOT("   + sizeof( float32 ) == " << sfloat32)
+   WALBERLA_LOG_INFO_ON_ROOT("   + sizeof( float16 ) == " << sfloat16)
+   if (sfloat64 != 8 || sfloat32 != 4 || sfloat16 != 2)
+   {
+      WALBERLA_LOG_INFO_ON_ROOT("   Your types don't seem to have the expected sizes.")
+      return EXIT_FAILURE;
+   }
+   WALBERLA_LOG_INFO_ON_ROOT("   -> works out!")
+
+   // 2) Construction from an integer literal and stream output; float16 is cast to double before streaming.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ")
+   const float64 a64 = 42;
+   const float32 a32 = 42;
+   const float16 a16 = 42;
+   WALBERLA_LOG_INFO_ON_ROOT("   + float64: " << a64)
+   WALBERLA_LOG_INFO_ON_ROOT("   + float32: " << a32)
+   WALBERLA_LOG_INFO_ON_ROOT("   + float16: " << (double) a16)
+   WALBERLA_LOG_INFO_ON_ROOT("   Casting and output compiles.")
+
+   // 3) One float16 addition, compared against the float64 reference value with an absolute tolerance of 1e-3.
+   //    NOTE(review): a failing WALBERLA_CHECK presumably aborts the program - confirm.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ")
+   const auto x   = float16(1.2);
+   const auto y   = float16(-1.8);
+   const float64 z   = -0.6;
+   const float16 sum = x + y;
+   WALBERLA_LOG_INFO_ON_ROOT("     " << (double) x << " + " << (double) y << " == " << (float64) (x + y) << "")
+   WALBERLA_CHECK(std::abs((float64) sum - z) < 1e-3, "Float16 arithmetic is broken.");
+   WALBERLA_LOG_INFO_ON_ROOT("")
+
+#   ifdef WALBERLA_BUILD_WITH_LIKWID_MARKERS
+   // 4) Optional likwid-instrumented loops; only compiled when likwid markers are enabled.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Memory traffic test. You have built with likwid enabled. Make sure to run ")
+   WALBERLA_LOG_INFO_ON_ROOT("     $ likwid-perfctr -g MEM_DP    -m ./CheckFP16")
+   WALBERLA_LOG_INFO_ON_ROOT("   to compare the memory traffic, and")
+   WALBERLA_LOG_INFO_ON_ROOT("     $ likwid-perfctr -g FLOPS_AVX -m ./CheckFP16")
+   WALBERLA_LOG_INFO_ON_ROOT(
+      "   for the stream-triad-like benchmark to check whether automatic float32 vectorization works.")
+   WALBERLA_LOG_INFO_ON_ROOT("")
+   WALBERLA_LOG_INFO_ON_ROOT("   The only real benefit of using float16 is reduced memory traffic since internally,\n"
+                             "all arithmetic operations are preceded by promotions to float32 (likely - depends on "
+                             "the machine).")
+   WALBERLA_LOG_INFO_ON_ROOT("   + Stream test ... ")
+
+   LIKWID_MARKER_INIT;
+   LIKWID_MARKER_THREADINIT;
+
+   // One marker region per type for the reduction loops ("-mem") ...
+   LIKWID_MARKER_REGISTER("float64-mem");
+   LIKWID_MARKER_REGISTER("float32-mem");
+   LIKWID_MARKER_REGISTER("float16-mem");
+
+   // ... and one per type for the element-wise addition ("-vec").
+   LIKWID_MARKER_REGISTER("float64-vec");
+   LIKWID_MARKER_REGISTER("float32-vec");
+   LIKWID_MARKER_REGISTER("float16-vec");
+
+   // 1e8 elements per vector and nine vectors in total (three per type): with the element sizes checked above
+   // this allocates roughly 4.2 GB.
+   size_t vsize = 100000000;
+
+   std::vector< float64 > v64(vsize, 0.01);
+   std::vector< float32 > v32(vsize, 0.01f);
+   std::vector< float16 > v16(vsize, float16(0.01));
+
+   std::vector< float64 > vv64(vsize, 0.02);
+   std::vector< float32 > vv32(vsize, 0.02f);
+   std::vector< float16 > vv16(vsize, float16(0.02));
+
+   std::vector< float64 > r64(vsize);
+   std::vector< float32 > r32(vsize);
+   std::vector< float16 > r16(vsize);
+
+   // Reduction with alternating signs: even indices are added, odd ones subtracted. The result is logged before
+   // the marker region is closed (presumably so the loop cannot be optimized away - confirm).
+   LIKWID_MARKER_START("float64-mem");
+   float64 sum64 = 0;
+   for (size_t j = 0; j < vsize; j++)
+   {
+      if (0 == j % 2) { sum64 += v64[j]; }
+      else { sum64 -= v64[j]; }
+   }
+   WALBERLA_LOG_INFO_ON_ROOT(
+      "   + Printing sum of float64 vector entries. Should be zero up to rounding errors: " << sum64);
+   LIKWID_MARKER_STOP("float64-mem");
+
+   // Start measurements
+   LIKWID_MARKER_START("float32-mem");
+   float32 sum32 = 0;
+   for (size_t j = 0; j < vsize; j++)
+   {
+      if (0 == j % 2) { sum32 += v32[j]; }
+      else { sum32 -= v32[j]; }
+   }
+   WALBERLA_LOG_INFO_ON_ROOT(
+      "   + Printing sum of float32 vector entries. Should be zero up to rounding errors: " << sum32);
+   LIKWID_MARKER_STOP("float32-mem");
+
+   // Start measurements
+   LIKWID_MARKER_START("float16-mem");
+   float16 sum16 = 0;
+   for (size_t j = 0; j < vsize; j++)
+   {
+      if (0 == j % 2) { sum16 += v16[j]; }
+      else { sum16 -= v16[j]; }
+   }
+   WALBERLA_LOG_INFO_ON_ROOT(
+      "   + Printing sum of float16 vector entries. Should be zero up to rounding errors: " << (double) sum16);
+   LIKWID_MARKER_STOP("float16-mem");
+
+   WALBERLA_LOG_INFO_ON_ROOT("   + Vectorization test ... ")
+
+   // Element-wise addition r = v + vv via kernel(), once per type, each inside its own marker region.
+   // One entry of the result (index vsize / 2) is logged inside the region.
+   float64* v64_ptr  = v64.data();
+   float64* vv64_ptr = vv64.data();
+   float64* r64_ptr  = r64.data();
+   LIKWID_MARKER_START("float64-vec");
+   kernel(v64_ptr, vv64_ptr, r64_ptr, vsize);
+   WALBERLA_LOG_INFO_ON_ROOT("   + Printing entry of float64 vector sum: " << r64[vsize / 2]);
+   LIKWID_MARKER_STOP("float64-vec");
+
+   float32* v32_ptr  = v32.data();
+   float32* vv32_ptr = vv32.data();
+   float32* r32_ptr  = r32.data();
+   LIKWID_MARKER_START("float32-vec");
+   kernel(v32_ptr, vv32_ptr, r32_ptr, vsize);
+   WALBERLA_LOG_INFO_ON_ROOT("   + Printing entry of float32 vector sum: " << r32[vsize / 2]);
+   LIKWID_MARKER_STOP("float32-vec");
+
+   float16* v16_ptr  = v16.data();
+   float16* vv16_ptr = vv16.data();
+   float16* r16_ptr  = r16.data();
+   LIKWID_MARKER_START("float16-vec");
+   kernel(v16_ptr, vv16_ptr, r16_ptr, vsize);
+   WALBERLA_LOG_INFO_ON_ROOT("   + Printing entry of float16 vector sum: " << (double) r16[vsize / 2]);
+   LIKWID_MARKER_STOP("float16-vec");
+
+   LIKWID_MARKER_CLOSE;
+
+#   else
+   WALBERLA_LOG_INFO_ON_ROOT(" - Build and run with likwid to run memory traffic test.")
+#   endif
+#endif
+   // Reached only when half precision support is enabled; the disabled branch above has already returned.
+   return EXIT_SUCCESS;
+}
+} // namespace walberla
+
+// Program entry point: forwards the command line to walberla::main and returns its exit code.
+int main(int argc, char** argv) { return walberla::main(argc, argv); }
--- a/src/core/DataTypes.h
+++ b/src/core/DataTypes.h
@@ -167,6 +167,33 @@ using real_t = double;
 using real_t = float;
 #endif

+/// Half precision support. Experimental. Use carefully.
+///
+/// This feature is experimental, since it strictly depends on the underlying architecture and compiler support.
+/// On x86 architectures, what you can expect is that the data format is supported natively only for storage and
+/// interchange. Arithmetic operations will likely involve casting to fp32 (C++ float) and truncation to fp16.
+/// Only bandwidth bound code may therefore benefit. None of this is guaranteed, and may change in the future.
+///
+#ifdef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+#   if defined(WALBERLA_CXX_COMPILER_IS_CLANG) || defined(WALBERLA_CXX_COMPILER_IS_GNU)
+/// Clang version must be 15 or higher for x86 half precision support.
+/// GCC version must be 12 or higher for x86 half precision support.
+/// Also support seems to require SSE, so ensure that respective instruction sets are enabled.
+/// See
+///   https://clang.llvm.org/docs/LanguageExtensions.html#half-precision-floating-point
+///   https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+/// for more information.
+using half    = _Float16;
+using float16 = half;
+#   else
+static_assert(false, "\n\n### Attempting to build walberla with half precision support.\n"
+                     "### However, the compiler you chose is not suited for that, or we simply have not implemented "
+                     "support for half precision and your compiler.\n");
+#   endif
+#endif
+using float32 = float;
+using float64 = double;
+
 inline constexpr real_t operator"" _r( long double t ) { return static_cast< real_t >(t); }
 inline constexpr real_t operator"" _r( unsigned long long int t ) { return static_cast< real_t >(t); }
 template< typename T > inline real_t real_c  ( T t ) { return numeric_cast< real_t >(t); } ///< cast to type real_t using "real_c(x)"

--- a/src/waLBerlaDefinitions.in.h
+++ b/src/waLBerlaDefinitions.in.h
@@ -13,6 +13,8 @@
 // double or single precision
 #cmakedefine WALBERLA_DOUBLE_ACCURACY

+// Experimental half precision support.
+#cmakedefine WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT

 // Debugging options
 #cmakedefine WALBERLA_ENABLE_GUI
@@ -25,6 +27,7 @@
 #cmakedefine WALBERLA_BUILD_WITH_OPENMP
 #cmakedefine WALBERLA_BUILD_WITH_METIS
 #cmakedefine WALBERLA_BUILD_WITH_PARMETIS
+#cmakedefine WALBERLA_BUILD_WITH_LIKWID_MARKERS

 #cmakedefine WALBERLA_BUILD_WITH_PYTHON


--- a/tests/core/CMakeLists.txt
+++ b/tests/core/CMakeLists.txt
@@ -193,6 +193,9 @@ waLBerla_compile_test( FILES DebugSTLTest.cpp )
 waLBerla_execute_test( NAME DebugSTLTest )
 set_tests_properties(DebugSTLTest PROPERTIES WILL_FAIL TRUE)

+# FP16Test only logs a hint and checks nothing unless WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT is enabled.
+waLBerla_compile_test( FILES FP16Test.cpp )
+waLBerla_execute_test( NAME FP16Test )
+
 waLBerla_compile_test( FILES FunctionTraitsTest.cpp )
 waLBerla_execute_test( NAME FunctionTraitsTest )


--- a/tests/core/FP16Test.cpp
+++ b/tests/core/FP16Test.cpp
+//======================================================================================================================
+//
+//  This file is part of waLBerla. waLBerla is free software: you can
+//  redistribute it and/or modify it under the terms of the GNU General Public
+//  License as published by the Free Software Foundation, either version 3 of
+//  the License, or (at your option) any later version.
+//
+//  waLBerla is distributed in the hope that it will be useful, but WITHOUT
+//  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+//  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+//  for more details.
+//
+//  You should have received a copy of the GNU General Public License along
+//  with waLBerla (see COPYING.txt). If not, see <http://www.gnu.org/licenses/>.
+//
+//! \file FP16Test.cpp
+//! \ingroup core
+//! \author Nils Kohl <nils.kohl@fau.de>
+//
+//======================================================================================================================
+
+#include "core/DataTypes.h"
+#include "core/debug/Debug.h"
+#include "core/debug/TestSubsystem.h"
+#include "core/logging/Logging.h"
+#include "core/Environment.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+
+namespace walberla {
+
+//! Sanity checks for the experimental float16 type: type sizes, stream output via promotion to double, and one
+//! addition compared against a float64 reference.
+//!
+//! If waLBerla was configured without WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT, only an explanatory message is
+//! logged and nothing is checked.
+//!
+//! \param argc  argument count, forwarded to walberla::Environment
+//! \param argv  argument values, forwarded to walberla::Environment
+void fp16Test( int argc, char ** argv )
+{
+   Environment const env( argc, argv );
+
+   WALBERLA_LOG_INFO_ON_ROOT("-------------")
+   WALBERLA_LOG_INFO_ON_ROOT(" FP16 checks ")
+   WALBERLA_LOG_INFO_ON_ROOT("-------------")
+
+#ifndef WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT
+   WALBERLA_LOG_INFO_ON_ROOT(" - Test does nothing as it was not built with fp16 support.")
+   WALBERLA_LOG_INFO_ON_ROOT(" - Apparently you have not enabled half precision support.")
+   WALBERLA_LOG_INFO_ON_ROOT(" - Reconfigure by setting the respective CMake variable "
+                             "(at the time of writing this it's called WALBERLA_BUILD_WITH_HALF_PRECISION_SUPPORT) "
+                             "to ON.")
+#else
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - Half precision support enabled via CMake!")
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - Sizeof checks: ")
+   const auto sfloat64 = sizeof(float64);
+   const auto sfloat32 = sizeof(float32);
+   const auto sfloat16 = sizeof(float16);
+   WALBERLA_CHECK_EQUAL( sfloat64, 8, "Your types don't seem to have the expected sizes." );
+   WALBERLA_CHECK_EQUAL( sfloat32, 4, "Your types don't seem to have the expected sizes." );
+   WALBERLA_CHECK_EQUAL( sfloat16, 2, "Your types don't seem to have the expected sizes." );
+
+   // float16 has no stream output of its own here, hence the cast to double before printing.
+   WALBERLA_LOG_INFO_ON_ROOT(" - Casting checks (promotion is required to format strings): ")
+   const float64 a64 = 42;
+   const float32 a32 = 42;
+   const float16 a16 = 42;
+   WALBERLA_LOG_INFO_ON_ROOT("   + float64: " << a64)
+   WALBERLA_LOG_INFO_ON_ROOT("   + float32: " << a32)
+   WALBERLA_LOG_INFO_ON_ROOT("   + float16: " << (double) a16)
+   WALBERLA_LOG_INFO_ON_ROOT("   Casting and output compiles.")
+
+   WALBERLA_LOG_INFO_ON_ROOT(" - Basic arithmetic check: ")
+   // Explicit conversions instead of the f16 literal suffix, matching the CheckFP16 tool; the suffix is a
+   // compiler extension, the conversion only needs the float16 type itself.
+   const float16 x = float16(1.2);
+   const float16 y = float16(-1.8);
+   const float64 z = -0.6;
+   // Storing the result in a float16 rounds it to half precision before it is compared.
+   const float16 sum = x + y;
+   WALBERLA_LOG_INFO_ON_ROOT("   + " << (double) x << " + " << (double) y << " == " << (float64) sum << " ? ")
+   // fp16 carries an 11 bit significand (about three decimal digits), so the sum differs from the float64
+   // reference by roughly 4e-4. Use the same absolute tolerance as the CheckFP16 tool.
+   // NOTE(review): the default tolerance of WALBERLA_CHECK_FLOAT_EQUAL for float64 operands is presumably much
+   // tighter than that - confirm.
+   WALBERLA_CHECK(std::abs((float64) sum - z) < 1e-3, "float16 addition does not work correctly.");
+#endif
+}
+
+}
+
+
+// Test driver: calls walberla::debug::enterTestMode(), runs the FP16 checks and returns EXIT_SUCCESS.
+// NOTE(review): a failing WALBERLA_CHECK_* presumably terminates the process before the return - confirm.
+int main( int argc, char** argv )
+{
+   walberla::debug::enterTestMode();
+   walberla::fp16Test( argc, argv );
+   return EXIT_SUCCESS;
+}