[libc-commits] [libc] [libc][math][c23] Enable C23 _Float16 math functions on GPUs (PR #99248)

Fri Jul 19 12:50:43 PDT 2024

https://github.com/overmighty updated https://github.com/llvm/llvm-project/pull/99248

>From f55e4493b4c5c5060735d4bdb292ce1b84070d24 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Wed, 17 Jul 2024 00:02:45 +0200
Subject: [PATCH 1/4] [libc][math][c23] Enable C23 _Float16 math functions on
 GPUs

---
 .../cmake/modules/CheckCompilerFeatures.cmake | 33 +++++++--
 libc/config/gpu/entrypoints.txt               | 71 +++++++++++++++++++
 2 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index a6d793d495c45..361c1e710b187 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -15,6 +15,12 @@ set(
 # Making sure ALL_COMPILER_FEATURES is sorted.
 list(SORT ALL_COMPILER_FEATURES)
 
+# Compiler features that are unavailable on GPU targets with the in-tree Clang.
+set(
+  CPU_ONLY_COMPILER_FEATURES
+    "float128"
+)
+
 # Function to check whether the compiler supports the provided set of features.
 # Usage:
 # compiler_supports(
@@ -65,13 +71,26 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
     set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE)
   endif()
 
-  try_compile(
-    has_feature
-    ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
-    SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
-    COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
-    LINK_OPTIONS ${link_options}
-  )
+  if(LIBC_TARGET_OS_IS_GPU)
+    # CUDA shouldn't be required to build the libc, only to test it, so we can't
+    # try to build CUDA binaries here. Since GPU builds are always compiled with
+    # the in-tree Clang, we just hardcode which compiler features are available
+    # when targeting GPUs.
+    if(feature IN_LIST CPU_ONLY_COMPILER_FEATURES)
+      set(has_feature FALSE)
+    else()
+      set(has_feature TRUE)
+    endif()
+  else()
+    try_compile(
+      has_feature
+      ${CMAKE_CURRENT_BINARY_DIR}/compiler_features
+      SOURCES ${LIBC_SOURCE_DIR}/cmake/modules/compiler_features/check_${feature}.cpp
+      COMPILE_DEFINITIONS -I${LIBC_SOURCE_DIR} ${compile_options}
+      LINK_OPTIONS ${link_options}
+    )
+  endif()
+
   if(has_feature)
     list(APPEND AVAILABLE_COMPILER_FEATURES ${feature})
     if(${feature} STREQUAL "float16")
diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b8eb743cf587a..e5007f8afc49e 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -346,6 +346,77 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.truncf
 )
 
+if(LIBC_TYPES_HAS_FLOAT16)
+  list(APPEND TARGET_LIBM_ENTRYPOINTS
+    # math.h C23 _Float16 entrypoints
+    libc.src.math.canonicalizef16
+    libc.src.math.ceilf16
+    libc.src.math.copysignf16
+    libc.src.math.f16add
+    libc.src.math.f16addf
+    libc.src.math.f16addl
+    libc.src.math.f16div
+    libc.src.math.f16divf
+    libc.src.math.f16divl
+    libc.src.math.f16fma
+    libc.src.math.f16fmaf
+    libc.src.math.f16fmal
+    libc.src.math.f16sqrt
+    libc.src.math.f16sqrtf
+    libc.src.math.f16sqrtl
+    libc.src.math.f16sub
+    libc.src.math.f16subf
+    libc.src.math.f16subl
+    libc.src.math.fabsf16
+    libc.src.math.fdimf16
+    libc.src.math.floorf16
+    libc.src.math.fmaxf16
+    libc.src.math.fmaximum_mag_numf16
+    libc.src.math.fmaximum_magf16
+    libc.src.math.fmaximum_numf16
+    libc.src.math.fmaximumf16
+    libc.src.math.fminf16
+    libc.src.math.fminimum_mag_numf16
+    libc.src.math.fminimum_magf16
+    libc.src.math.fminimum_numf16
+    libc.src.math.fminimumf16
+    libc.src.math.fmodf16
+    libc.src.math.frexpf16
+    libc.src.math.fromfpf16
+    libc.src.math.fromfpxf16
+    libc.src.math.getpayloadf16
+    libc.src.math.ilogbf16
+    libc.src.math.ldexpf16
+    libc.src.math.llogbf16
+    libc.src.math.llrintf16
+    libc.src.math.llroundf16
+    libc.src.math.logbf16
+    libc.src.math.lrintf16
+    libc.src.math.lroundf16
+    libc.src.math.modff16
+    libc.src.math.nanf16
+    libc.src.math.nearbyintf16
+    libc.src.math.nextafterf16
+    libc.src.math.nextdownf16
+    libc.src.math.nexttowardf16
+    libc.src.math.nextupf16
+    libc.src.math.remainderf16
+    libc.src.math.remquof16
+    libc.src.math.rintf16
+    libc.src.math.roundevenf16
+    libc.src.math.roundf16
+    libc.src.math.scalblnf16
+    libc.src.math.scalbnf16
+    libc.src.math.setpayloadf16
+    libc.src.math.setpayloadsigf16
+    libc.src.math.totalorderf16
+    libc.src.math.totalordermagf16
+    libc.src.math.truncf16
+    libc.src.math.ufromfpf16
+    libc.src.math.ufromfpxf16
+  )
+endif()
+
 set(TARGET_LLVMLIBC_ENTRYPOINTS
   ${TARGET_LIBC_ENTRYPOINTS}
   ${TARGET_LIBM_ENTRYPOINTS}

>From 7cc8e19068979ab8bbb862ef84e9c0fae7d8440c Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Wed, 17 Jul 2024 11:52:01 +0200
Subject: [PATCH 2/4] fixup! [libc][math][c23] Enable C23 _Float16 math
 functions on GPUs

Enable ROUND_OPT flag and builtin usage for _Float16 functions on GPUs.
---
 libc/cmake/modules/LLVMLibCFlagRules.cmake          |  6 ++++--
 libc/src/__support/macros/properties/cpu_features.h |  4 ++++
 libc/src/math/generic/CMakeLists.txt                | 12 ++++++------
 libc/src/math/generic/ceilf16.cpp                   |  4 ++--
 libc/src/math/generic/floorf16.cpp                  |  4 ++--
 libc/src/math/generic/rintf16.cpp                   |  4 ++--
 libc/src/math/generic/roundevenf16.cpp              |  4 ++--
 libc/src/math/generic/roundf16.cpp                  |  5 +++--
 libc/src/math/generic/truncf16.cpp                  |  4 ++--
 9 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index eca7ba8d183e6..92245ffab4746 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -276,8 +276,10 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2")))
   set(SKIP_FLAG_EXPANSION_EXPLICIT_SIMD_OPT TRUE)
 endif()
 
-# Skip ROUND_OPT flag for targets that don't support SSE 4.2.
+# Skip ROUND_OPT flag for targets that don't support rounding instructions. On
+# x86, these are SSE4.1 instructions, but we already had code to check for
+# SSE4.2 support.
 if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR
-       LIBC_TARGET_ARCHITECTURE_IS_AARCH64))
+       LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU))
   set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
 endif()
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 80d48be702070..ba6e5b314e9de 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -49,4 +49,8 @@
 #define LIBC_TARGET_CPU_HAS_NEAREST_INT
 #endif
 
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_GPU)
+#define LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS
+#endif
+
 #endif // LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index d775026fabb3e..51743784ff156 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -111,7 +111,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -503,7 +503,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -572,7 +572,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -641,7 +641,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -710,7 +710,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
@@ -903,7 +903,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
-    libc.src.__support.macros.properties.architectures
+    libc.src.__support.macros.properties.cpu_features
   FLAGS
     ROUND_OPT
 )
diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp
index 708bc4cfd4860..8af31c6623a02 100644
--- a/libc/src/math/generic/ceilf16.cpp
+++ b/libc/src/math/generic/ceilf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_ceilf(x));
 #else
   return fputil::ceil(x);
diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp
index 84e4b0730ac68..3092048f5ab06 100644
--- a/libc/src/math/generic/floorf16.cpp
+++ b/libc/src/math/generic/floorf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_floorf(x));
 #else
   return fputil::floor(x);
diff --git a/libc/src/math/generic/rintf16.cpp b/libc/src/math/generic/rintf16.cpp
index 0e8c091efcf9b..3a53dd28e3d10 100644
--- a/libc/src/math/generic/rintf16.cpp
+++ b/libc/src/math/generic/rintf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, rintf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_rintf(x));
 #else
   return fputil::round_using_current_rounding_mode(x);
diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp
index b45670bd24ff1..c3dbd779b9739 100644
--- a/libc/src/math/generic/roundevenf16.cpp
+++ b/libc/src/math/generic/roundevenf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) &&                                   \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundevenf(x));
 #else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp
index cb668c0e76388..a5e2b44fbd54b 100644
--- a/libc/src/math/generic/roundf16.cpp
+++ b/libc/src/math/generic/roundf16.cpp
@@ -10,12 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) {
-#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#if defined(__LIBC_USE_BUILTIN_ROUND) &&                                       \
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_roundf(x));
 #else
   return fputil::round(x);
diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp
index b931053e53438..31b1214a9a0e4 100644
--- a/libc/src/math/generic/truncf16.cpp
+++ b/libc/src/math/generic/truncf16.cpp
@@ -10,13 +10,13 @@
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
-#include "src/__support/macros/properties/architectures.h"
+#include "src/__support/macros/properties/cpu_features.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
 LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) {
 #if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_RINT_TRUNC) &&                       \
-    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+    defined(LIBC_TARGET_CPU_HAS_FAST_FLOAT16_OPS)
   return static_cast<float16>(__builtin_truncf(x));
 #else
   return fputil::trunc(x);

>From eb1764049da104aef73dd9aa1c7c2fba649f9c2d Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 19 Jul 2024 21:13:04 +0200
Subject: [PATCH 3/4] fixup! [libc][math][c23] Enable C23 _Float16 math
 functions on GPUs

Disable narrowing functions that take `long double` arguments.
---
 libc/config/gpu/entrypoints.txt | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index e5007f8afc49e..ac98e2c819932 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -354,19 +354,14 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.copysignf16
     libc.src.math.f16add
     libc.src.math.f16addf
-    libc.src.math.f16addl
     libc.src.math.f16div
     libc.src.math.f16divf
-    libc.src.math.f16divl
     libc.src.math.f16fma
     libc.src.math.f16fmaf
-    libc.src.math.f16fmal
     libc.src.math.f16sqrt
     libc.src.math.f16sqrtf
-    libc.src.math.f16sqrtl
     libc.src.math.f16sub
     libc.src.math.f16subf
-    libc.src.math.f16subl
     libc.src.math.fabsf16
     libc.src.math.fdimf16
     libc.src.math.floorf16

>From 3c7f76751fbbeb16b31bed92a0a04ea824008ed6 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 19 Jul 2024 21:15:30 +0200
Subject: [PATCH 4/4] fixup! [libc][math][c23] Enable C23 _Float16 math
 functions on GPUs

Enable f16mul and f16mulf.
---
 libc/config/gpu/entrypoints.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index ac98e2c819932..b6fdd9dbc65b2 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -358,6 +358,8 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.f16divf
     libc.src.math.f16fma
     libc.src.math.f16fmaf
+    libc.src.math.f16mul
+    libc.src.math.f16mulf
     libc.src.math.f16sqrt
     libc.src.math.f16sqrtf
     libc.src.math.f16sub