[libc-commits] [libc] [libc][math] Implement nearest integer functions using builtins when available (PR #98376)

Wed Jul 10 13:14:25 PDT 2024

https://github.com/overmighty created https://github.com/llvm/llvm-project/pull/98376

None

>From 34ffd03ffb23c5cf5eb70b1164e1c33fae24662d Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Wed, 10 Jul 2024 20:41:57 +0200
Subject: [PATCH] [libc][math] Implement nearest integer functions using
 builtins when available

---
 .../cmake/modules/CheckCompilerFeatures.cmake |  26 ++-
 .../modules/LLVMLibCCompileOptionRules.cmake  |  20 ++-
 libc/cmake/modules/LLVMLibCFlagRules.cmake    |   3 +-
 .../check_builtin_ceil_floor_trunc.cpp        |   9 +
 .../compiler_features/check_builtin_round.cpp |   5 +
 .../check_builtin_roundeven.cpp               |   5 +
 libc/config/linux/aarch64/entrypoints.txt     |   1 +
 .../FPUtil/NearestIntegerOperations.h         |  37 ++--
 libc/src/math/generic/CMakeLists.txt          |  35 ++++
 libc/src/math/generic/ceil.cpp                |   8 +-
 libc/src/math/generic/ceilf.cpp               |   8 +-
 libc/src/math/generic/ceilf16.cpp             |  10 +-
 libc/src/math/generic/floor.cpp               |   8 +-
 libc/src/math/generic/floorf.cpp              |   8 +-
 libc/src/math/generic/floorf16.cpp            |  10 +-
 libc/src/math/generic/round.cpp               |   8 +-
 libc/src/math/generic/roundeven.cpp           |   4 +
 libc/src/math/generic/roundevenf.cpp          |   4 +
 libc/src/math/generic/roundevenf16.cpp        |   6 +
 libc/src/math/generic/roundf.cpp              |   8 +-
 libc/src/math/generic/roundf16.cpp            |   9 +-
 libc/src/math/generic/trunc.cpp               |   8 +-
 libc/src/math/generic/truncf.cpp              |   8 +-
 libc/src/math/generic/truncf16.cpp            |  10 +-
 .../math/performance_testing/CMakeLists.txt   |  19 ++
 .../performance_testing/NearestIntegerPerf.h  | 110 ++++++++++++
 .../nearest_integer_funcs_perf.cpp            | 165 ++++++++++++++++++
 27 files changed, 518 insertions(+), 34 deletions(-)
 create mode 100644 libc/cmake/modules/compiler_features/check_builtin_ceil_floor_trunc.cpp
 create mode 100644 libc/cmake/modules/compiler_features/check_builtin_round.cpp
 create mode 100644 libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp
 create mode 100644 libc/test/src/math/performance_testing/NearestIntegerPerf.h
 create mode 100644 libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp

diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake
index d84c07b35d2d7..83822892c8096 100644
--- a/libc/cmake/modules/CheckCompilerFeatures.cmake
+++ b/libc/cmake/modules/CheckCompilerFeatures.cmake
@@ -2,7 +2,15 @@
 # Compiler features definition and flags
 # ------------------------------------------------------------------------------
 
-set(ALL_COMPILER_FEATURES "float16" "float128" "fixed_point")
+set(
+  ALL_COMPILER_FEATURES
+    "builtin_ceil_floor_trunc"
+    "builtin_round"
+    "builtin_roundeven"
+    "float16"
+    "float128"
+    "fixed_point"
+)
 
 # Making sure ALL_COMPILER_FEATURES is sorted.
 list(SORT ALL_COMPILER_FEATURES)
@@ -39,11 +47,19 @@ endfunction()
 set(AVAILABLE_COMPILER_FEATURES "")
 
 # Try compile a C file to check if flag is supported.
-set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
 foreach(feature IN LISTS ALL_COMPILER_FEATURES)
+  set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)
   set(compile_options ${LIBC_COMPILE_OPTIONS_NATIVE})
   if(${feature} STREQUAL "fixed_point")
     list(APPEND compile_options "-ffixed-point")
+  elseif(${feature} MATCHES "^builtin_")
+    set(compile_options ${LIBC_COMPILE_OPTIONS_DEFAULT})
+    # The compiler might handle calls to rounding builtins by generating calls
+    # to the respective libc math functions, in which case we cannot use these
+    # builtins in our implementations of these functions. We check that this is
+    # not the case by trying to link an executable, since linking would fail due
+    # to unresolved references if calls to libc functions were generated.
+    set(CMAKE_TRY_COMPILE_TARGET_TYPE EXECUTABLE)
   endif()
 
   try_compile(
@@ -60,6 +76,12 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES)
       set(LIBC_TYPES_HAS_FLOAT128 TRUE)
     elseif(${feature} STREQUAL "fixed_point")
       set(LIBC_COMPILER_HAS_FIXED_POINT TRUE)
+    elseif(${feature} STREQUAL "builtin_ceil_floor_trunc")
+      set(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_TRUNC TRUE)
+    elseif(${feature} STREQUAL "builtin_round")
+      set(LIBC_COMPILER_HAS_BUILTIN_ROUND TRUE)
+    elseif(${feature} STREQUAL "builtin_roundeven")
+      set(LIBC_COMPILER_HAS_BUILTIN_ROUNDEVEN TRUE)
     endif()
   endif()
 endforeach()
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index c5e7dfe8abd0f..855d69d2a0fc9 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -4,7 +4,7 @@ function(_get_compile_options_from_flags output_var)
   if(LIBC_TARGET_ARCHITECTURE_IS_RISCV64 OR(LIBC_CPU_FEATURES MATCHES "FMA"))
     check_flag(ADD_FMA_FLAG ${FMA_OPT_FLAG} ${ARGN})
   endif()
-  check_flag(ADD_SSE4_2_FLAG ${ROUND_OPT_FLAG} ${ARGN})
+  check_flag(ADD_ROUND_OPT_FLAG ${ROUND_OPT_FLAG} ${ARGN})
   check_flag(ADD_EXPLICIT_SIMD_OPT_FLAG ${EXPLICIT_SIMD_OPT_FLAG} ${ARGN})
 
   if(LLVM_COMPILER_IS_GCC_COMPATIBLE)
@@ -16,8 +16,22 @@ function(_get_compile_options_from_flags output_var)
         list(APPEND compile_options "-D__LIBC_RISCV_USE_FMA")
       endif()
     endif()
-    if(ADD_SSE4_2_FLAG)
-      list(APPEND compile_options "-msse4.2")
+    if(ADD_ROUND_OPT_FLAG)
+      if(LIBC_TARGET_ARCHITECTURE_IS_X86)
+        # ROUND_OPT_FLAG is only enabled if SSE4.2 is detected, not just SSE4.1,
+        # because there was code to check for SSE4.2 already, and few CPUs only
+        # have SSE4.1.
+        list(APPEND compile_options "-msse4.2")
+      endif()
+      if(LIBC_COMPILER_HAS_BUILTIN_CEIL_FLOOR_TRUNC)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC")
+      endif()
+      if(LIBC_COMPILER_HAS_BUILTIN_ROUND)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_ROUND")
+      endif()
+      if(LIBC_COMPILER_HAS_BUILTIN_ROUNDEVEN)
+        list(APPEND compile_options "-D__LIBC_USE_BUILTIN_ROUNDEVEN")
+      endif()
     endif()
     if(ADD_EXPLICIT_SIMD_OPT_FLAG)
       list(APPEND compile_options "-D__LIBC_EXPLICIT_SIMD_OPT")
diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake
index 18e36dfde5cc1..eca7ba8d183e6 100644
--- a/libc/cmake/modules/LLVMLibCFlagRules.cmake
+++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake
@@ -277,6 +277,7 @@ if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE2")))
 endif()
 
 # Skip ROUND_OPT flag for targets that don't support SSE 4.2.
-if(NOT(LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")))
+if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2")) OR
+       LIBC_TARGET_ARCHITECTURE_IS_AARCH64))
   set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE)
 endif()
diff --git a/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_trunc.cpp b/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_trunc.cpp
new file mode 100644
index 0000000000000..031dd9376f3c1
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_ceil_floor_trunc.cpp
@@ -0,0 +1,9 @@
+float try_builtin_ceilf(float x) { return __builtin_ceilf(x); }
+float try_builtin_floorf(float x) { return __builtin_floorf(x); }
+float try_builtin_truncf(float x) { return __builtin_truncf(x); }
+
+double try_builtin_ceil(double x) { return __builtin_ceil(x); }
+double try_builtin_floor(double x) { return __builtin_floor(x); }
+double try_builtin_trunc(double x) { return __builtin_trunc(x); }
+
+int main() {}
diff --git a/libc/cmake/modules/compiler_features/check_builtin_round.cpp b/libc/cmake/modules/compiler_features/check_builtin_round.cpp
new file mode 100644
index 0000000000000..8c3065c2de06a
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_round.cpp
@@ -0,0 +1,5 @@
+float try_builtin_roundf(float x) { return __builtin_roundf(x); }
+
+double try_builtin_round(double x) { return __builtin_round(x); }
+
+int main() {}
diff --git a/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp b/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp
new file mode 100644
index 0000000000000..2480abae84c36
--- /dev/null
+++ b/libc/cmake/modules/compiler_features/check_builtin_roundeven.cpp
@@ -0,0 +1,5 @@
+float try_builtin_roundevenf(float x) { return __builtin_roundevenf(x); }
+
+double try_builtin_roundeven(double x) { return __builtin_roundeven(x); }
+
+int main() {}
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 030c3d3a99a02..515c472ef309e 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -489,6 +489,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.rintl
     libc.src.math.round
     libc.src.math.roundf
+    libc.src.math.roundevenf
     libc.src.math.roundl
     libc.src.math.scalbn
     libc.src.math.scalbnf
diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h
index cff32938229d0..a9a0a97eebb5c 100644
--- a/libc/src/__support/FPUtil/NearestIntegerOperations.h
+++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h
@@ -75,15 +75,17 @@ LIBC_INLINE T ceil(T x) {
   }
 
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  bits.set_mantissa(trunc_mantissa);
-  T trunc_value = bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  bits.set_uintval(trunc_u);
+  T trunc_value = bits.get_val();
+
   // If x is negative, the ceil operation is equivalent to the trunc operation.
   if (is_neg)
     return trunc_value;
@@ -130,15 +132,17 @@ LIBC_INLINE T round(T x) {
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
   bool half_bit_set =
       bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1)));
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  bits.set_mantissa(trunc_mantissa);
-  T trunc_value = bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  bits.set_uintval(trunc_u);
+  T trunc_value = bits.get_val();
+
   if (!half_bit_set) {
     // Franctional part is less than 0.5 so round value is the
     // same as the trunc value.
@@ -188,16 +192,17 @@ round_using_specific_rounding_mode(T x, int rnd) {
   }
 
   uint32_t trim_size = FPBits<T>::FRACTION_LEN - exponent;
-  FPBits<T> new_bits = bits;
-  StorageType trunc_mantissa =
-      static_cast<StorageType>((bits.get_mantissa() >> trim_size) << trim_size);
-  new_bits.set_mantissa(trunc_mantissa);
-  T trunc_value = new_bits.get_val();
+  StorageType x_u = bits.uintval();
+  StorageType trunc_u =
+      static_cast<StorageType>((x_u >> trim_size) << trim_size);
 
   // If x is already an integer, return it.
-  if (trunc_value == x)
+  if (trunc_u == x_u)
     return x;
 
+  FPBits<T> new_bits(trunc_u);
+  T trunc_value = new_bits.get_val();
+
   StorageType trim_value =
       bits.get_mantissa() &
       static_cast<StorageType>(((StorageType(1) << trim_size) - 1));
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 5e920307d39de..915fc076826f9 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -70,6 +70,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -82,6 +84,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -107,6 +111,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.macros.properties.architectures
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -455,6 +462,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -467,6 +476,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -492,6 +503,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.macros.properties.architectures
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -517,6 +531,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -529,6 +545,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -554,6 +572,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.macros.properties.architectures
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -579,6 +600,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -591,6 +614,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -616,6 +641,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.macros.properties.architectures
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -641,6 +669,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -653,6 +683,8 @@ add_entrypoint_object(
     -O3
   DEPENDS
     libc.src.__support.FPUtil.nearest_integer_operations
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
@@ -678,6 +710,9 @@ add_entrypoint_object(
   DEPENDS
     libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.macros.properties.architectures
+  FLAGS
+    ROUND_OPT
 )
 
 add_entrypoint_object(
diff --git a/libc/src/math/generic/ceil.cpp b/libc/src/math/generic/ceil.cpp
index efd0f246a9b90..63da803033e22 100644
--- a/libc/src/math/generic/ceil.cpp
+++ b/libc/src/math/generic/ceil.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, ceil, (double x)) { return fputil::ceil(x); }
+LLVM_LIBC_FUNCTION(double, ceil, (double x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_ceil(x);
+#else
+  return fputil::ceil(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ceilf.cpp b/libc/src/math/generic/ceilf.cpp
index d49b34242da4f..51ef68f1dd871 100644
--- a/libc/src/math/generic/ceilf.cpp
+++ b/libc/src/math/generic/ceilf.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float, ceilf, (float x)) { return fputil::ceil(x); }
+LLVM_LIBC_FUNCTION(float, ceilf, (float x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_ceilf(x);
+#else
+  return fputil::ceil(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/ceilf16.cpp b/libc/src/math/generic/ceilf16.cpp
index 205d7428f66e6..ee584c25a4ae9 100644
--- a/libc/src/math/generic/ceilf16.cpp
+++ b/libc/src/math/generic/ceilf16.cpp
@@ -9,9 +9,17 @@
 #include "src/math/ceilf16.h"
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/architectures.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) { return fputil::ceil(x); }
+LLVM_LIBC_FUNCTION(float16, ceilf16, (float16 x)) {
+#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC) &&                            \
+    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  return static_cast<float16>(__builtin_ceilf(x));
+#else
+  return fputil::ceil(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/floor.cpp b/libc/src/math/generic/floor.cpp
index 60386f0c9cf81..bb58ca6a35402 100644
--- a/libc/src/math/generic/floor.cpp
+++ b/libc/src/math/generic/floor.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, floor, (double x)) { return fputil::floor(x); }
+LLVM_LIBC_FUNCTION(double, floor, (double x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_floor(x);
+#else
+  return fputil::floor(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/floorf.cpp b/libc/src/math/generic/floorf.cpp
index 85666688685dc..459f338d897be 100644
--- a/libc/src/math/generic/floorf.cpp
+++ b/libc/src/math/generic/floorf.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float, floorf, (float x)) { return fputil::floor(x); }
+LLVM_LIBC_FUNCTION(float, floorf, (float x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_floorf(x);
+#else
+  return fputil::floor(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/floorf16.cpp b/libc/src/math/generic/floorf16.cpp
index db0b326c0e5f6..6d8c497946c84 100644
--- a/libc/src/math/generic/floorf16.cpp
+++ b/libc/src/math/generic/floorf16.cpp
@@ -9,9 +9,17 @@
 #include "src/math/floorf16.h"
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/architectures.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) { return fputil::floor(x); }
+LLVM_LIBC_FUNCTION(float16, floorf16, (float16 x)) {
+#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC) &&                            \
+    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  return static_cast<float16>(__builtin_floorf(x));
+#else
+  return fputil::floor(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/round.cpp b/libc/src/math/generic/round.cpp
index ca8f19f35f7fe..d873524ad9f42 100644
--- a/libc/src/math/generic/round.cpp
+++ b/libc/src/math/generic/round.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, round, (double x)) { return fputil::round(x); }
+LLVM_LIBC_FUNCTION(double, round, (double x)) {
+#ifdef __LIBC_USE_BUILTIN_ROUND
+  return __builtin_round(x);
+#else
+  return fputil::round(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/roundeven.cpp b/libc/src/math/generic/roundeven.cpp
index 5f2adf9b5fce6..76409d526e208 100644
--- a/libc/src/math/generic/roundeven.cpp
+++ b/libc/src/math/generic/roundeven.cpp
@@ -13,7 +13,11 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, roundeven, (double x)) {
+#ifdef __LIBC_USE_BUILTIN_ROUNDEVEN
+  return __builtin_roundeven(x);
+#else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/roundevenf.cpp b/libc/src/math/generic/roundevenf.cpp
index 353bec74ecf02..22538272bedbd 100644
--- a/libc/src/math/generic/roundevenf.cpp
+++ b/libc/src/math/generic/roundevenf.cpp
@@ -13,7 +13,11 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, roundevenf, (float x)) {
+#ifdef __LIBC_USE_BUILTIN_ROUNDEVEN
+  return __builtin_roundevenf(x);
+#else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/roundevenf16.cpp b/libc/src/math/generic/roundevenf16.cpp
index 9ecf79ce6f6c2..90c75a10d3ddb 100644
--- a/libc/src/math/generic/roundevenf16.cpp
+++ b/libc/src/math/generic/roundevenf16.cpp
@@ -9,11 +9,17 @@
 #include "src/math/roundevenf16.h"
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/architectures.h"
 
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float16, roundevenf16, (float16 x)) {
+#if defined(__LIBC_USE_BUILTIN_ROUNDEVEN) &&                                   \
+    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  return static_cast<float16>(__builtin_roundevenf(x));
+#else
   return fputil::round_using_specific_rounding_mode(x, FP_INT_TONEAREST);
+#endif
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/roundf.cpp b/libc/src/math/generic/roundf.cpp
index 9627390ea8b8d..8b3add7cb9e2d 100644
--- a/libc/src/math/generic/roundf.cpp
+++ b/libc/src/math/generic/roundf.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float, roundf, (float x)) { return fputil::round(x); }
+LLVM_LIBC_FUNCTION(float, roundf, (float x)) {
+#ifdef __LIBC_USE_BUILTIN_ROUND
+  return __builtin_roundf(x);
+#else
+  return fputil::round(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/roundf16.cpp b/libc/src/math/generic/roundf16.cpp
index 75a255d7798d5..fca0194ec5dbb 100644
--- a/libc/src/math/generic/roundf16.cpp
+++ b/libc/src/math/generic/roundf16.cpp
@@ -9,9 +9,16 @@
 #include "src/math/roundf16.h"
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/architectures.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) { return fputil::round(x); }
+LLVM_LIBC_FUNCTION(float16, roundf16, (float16 x)) {
+#if defined(__LIBC_USE_BUILTIN_ROUND) && defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  return static_cast<float16>(__builtin_roundf(x));
+#else
+  return fputil::round(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/trunc.cpp b/libc/src/math/generic/trunc.cpp
index d171ab1f092fd..5761565646c36 100644
--- a/libc/src/math/generic/trunc.cpp
+++ b/libc/src/math/generic/trunc.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(double, trunc, (double x)) { return fputil::trunc(x); }
+LLVM_LIBC_FUNCTION(double, trunc, (double x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_trunc(x);
+#else
+  return fputil::trunc(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/truncf.cpp b/libc/src/math/generic/truncf.cpp
index 93bfb7f2882a5..2186fd6dd942e 100644
--- a/libc/src/math/generic/truncf.cpp
+++ b/libc/src/math/generic/truncf.cpp
@@ -12,6 +12,12 @@
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float, truncf, (float x)) { return fputil::trunc(x); }
+LLVM_LIBC_FUNCTION(float, truncf, (float x)) {
+#ifdef __LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC
+  return __builtin_truncf(x);
+#else
+  return fputil::trunc(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/truncf16.cpp b/libc/src/math/generic/truncf16.cpp
index 65bd57d810323..43dfd083eb6f2 100644
--- a/libc/src/math/generic/truncf16.cpp
+++ b/libc/src/math/generic/truncf16.cpp
@@ -9,9 +9,17 @@
 #include "src/math/truncf16.h"
 #include "src/__support/FPUtil/NearestIntegerOperations.h"
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/architectures.h"
 
 namespace LIBC_NAMESPACE {
 
-LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) { return fputil::trunc(x); }
+LLVM_LIBC_FUNCTION(float16, truncf16, (float16 x)) {
+#if defined(__LIBC_USE_BUILTIN_CEIL_FLOOR_TRUNC) &&                            \
+    defined(LIBC_TARGET_ARCH_IS_AARCH64)
+  return static_cast<float16>(__builtin_truncf(x));
+#else
+  return fputil::trunc(x);
+#endif
+}
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index 4ea78f9999e4d..8c1e8c72d4e64 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -366,3 +366,22 @@ add_perf_binary(
   COMPILE_OPTIONS
     -fno-builtin
 )
+
+add_perf_binary(
+  nearest_integer_funcs_perf
+  SRCS
+    nearest_integer_funcs_perf.cpp
+  DEPENDS
+    libc.src.math.ceilf16
+    libc.src.math.ceilf
+    libc.src.math.floorf16
+    libc.src.math.floorf
+    libc.src.math.roundevenf16
+    libc.src.math.roundevenf
+    libc.src.math.roundf16
+    libc.src.math.roundf
+    libc.src.math.truncf16
+    libc.src.math.truncf
+  COMPILE_OPTIONS
+    -fno-builtin
+)
diff --git a/libc/test/src/math/performance_testing/NearestIntegerPerf.h b/libc/test/src/math/performance_testing/NearestIntegerPerf.h
new file mode 100644
index 0000000000000..6ed8d8334eabe
--- /dev/null
+++ b/libc/test/src/math/performance_testing/NearestIntegerPerf.h
@@ -0,0 +1,110 @@
+//===-- Perf utility class for nearest integer functions -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "test/src/math/performance_testing/Timer.h"
+
+#include <fstream>
+
+namespace LIBC_NAMESPACE {
+namespace testing {
+
+template <typename T> class NearestIntegerPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+public:
+  typedef T Func(T);
+
+  static void runPerfInRange(Func myFunc, Func otherFunc,
+                             StorageType startingBit, StorageType endingBit,
+                             StorageType step, size_t rounds,
+                             std::ofstream &log) {
+    auto runner = [=](Func func) {
+      volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = startingBit; bits <= endingBit; bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
+      }
+    };
+
+    Timer timer;
+    timer.start();
+    runner(myFunc);
+    timer.stop();
+
+    size_t numberOfRuns = (endingBit - startingBit) / step + 1;
+    double myAverage =
+        static_cast<double>(timer.nanoseconds()) / numberOfRuns / rounds;
+    log << "-- My function --\n";
+    log << "     Total time      : " << timer.nanoseconds() << " ns \n";
+    log << "     Average runtime : " << myAverage << " ns/op \n";
+    log << "     Ops per second  : "
+        << static_cast<uint64_t>(1'000'000'000.0 / myAverage) << " op/s \n";
+
+    timer.start();
+    runner(otherFunc);
+    timer.stop();
+
+    double otherAverage =
+        static_cast<double>(timer.nanoseconds()) / numberOfRuns / rounds;
+    log << "-- Other function --\n";
+    log << "     Total time      : " << timer.nanoseconds() << " ns \n";
+    log << "     Average runtime : " << otherAverage << " ns/op \n";
+    log << "     Ops per second  : "
+        << static_cast<uint64_t>(1'000'000'000.0 / otherAverage) << " op/s \n";
+
+    log << "-- Average runtime ratio --\n";
+    log << "     Mine / Other's  : " << myAverage / otherAverage << " \n";
+  }
+
+  static void runPerf(Func myFunc, Func otherFunc, size_t rounds,
+                      const char *logFile) {
+    std::ofstream log(logFile);
+    log << "Performance tests with inputs in normal integral range:\n";
+    runPerfInRange(myFunc, otherFunc,
+                   StorageType((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN),
+                   StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1)
+                               << FPBits::SIG_LEN),
+                   StorageType(1 << FPBits::SIG_LEN),
+                   rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS, log);
+    log << "\n Performance tests with inputs in low integral range:\n";
+    runPerfInRange(myFunc, otherFunc, StorageType(1 << FPBits::SIG_LEN),
+                   StorageType((FPBits::EXP_BIAS - 1) << FPBits::SIG_LEN),
+                   StorageType(1 << FPBits::SIG_LEN),
+                   rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS, log);
+    log << "\n Performance tests with inputs in high integral range:\n";
+    runPerfInRange(myFunc, otherFunc,
+                   StorageType((FPBits::EXP_BIAS + FPBits::FRACTION_LEN)
+                               << FPBits::SIG_LEN),
+                   StorageType(FPBits::MAX_BIASED_EXPONENT << FPBits::SIG_LEN),
+                   StorageType(1 << FPBits::SIG_LEN),
+                   rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS, log);
+    log << "\n Performance tests with inputs in normal fractional range:\n";
+    runPerfInRange(myFunc, otherFunc,
+                   StorageType(((FPBits::EXP_BIAS + 1) << FPBits::SIG_LEN) + 1),
+                   StorageType(((FPBits::EXP_BIAS + 2) << FPBits::SIG_LEN) - 1),
+                   StorageType(1), rounds * 2, log);
+    log << "\n Performance tests with inputs in subnormal fractional range:\n";
+    runPerfInRange(myFunc, otherFunc, StorageType(1),
+                   StorageType(FPBits::SIG_MASK), StorageType(1), rounds, log);
+  }
+};
+
+} // namespace testing
+} // namespace LIBC_NAMESPACE
+
+#define NEAREST_INTEGER_PERF(T, myFunc, otherFunc, rounds, filename)           \
+  {                                                                            \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::runPerf(                   \
+        &myFunc, &otherFunc, rounds, filename);                                \
+    LIBC_NAMESPACE::testing::NearestIntegerPerf<T>::runPerf(                   \
+        &myFunc, &otherFunc, rounds, filename);                                \
+  }
diff --git a/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
new file mode 100644
index 0000000000000..081c6c2c4c2d9
--- /dev/null
+++ b/libc/test/src/math/performance_testing/nearest_integer_funcs_perf.cpp
@@ -0,0 +1,165 @@
+//===-- Performance test for nearest integer functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/ceilf.h"
+#include "src/math/ceilf16.h"
+#include "src/math/floorf.h"
+#include "src/math/floorf16.h"
+#include "src/math/roundevenf.h"
+#include "src/math/roundevenf16.h"
+#include "src/math/roundf.h"
+#include "src/math/roundf16.h"
+#include "src/math/truncf.h"
+#include "src/math/truncf16.h"
+#include "test/src/math/performance_testing/Timer.h"
+
+#include <fstream>
+#include <math.h>
+
+namespace LIBC_NAMESPACE::testing {
+
+// Benchmarks a unary function `T(T)` against a reference implementation over
+// several ranges of raw bit patterns of `T`, logging the timings to a file.
+template <typename T> class NearestIntegerPerf {
+  using FPBits = fputil::FPBits<T>;
+  using StorageType = typename FPBits::StorageType;
+
+  // Returns `biased_exp` shifted left by SIG_LEN, i.e. an exponent-only bit
+  // pattern. Converting to StorageType first keeps wide shifts out of `int`.
+  static constexpr StorageType exp_bits(int biased_exp) {
+    return static_cast<StorageType>(static_cast<StorageType>(biased_exp)
+                                    << FPBits::SIG_LEN);
+  }
+
+public:
+  typedef T Func(T);
+
+  // Runs `rounds` sweeps of each function over the bit patterns
+  // starting_bit, starting_bit + step, ... up to ending_bit, then writes
+  // each function's timings and the ratio of their averages to `log`.
+  static void run_perf_in_range(Func my_func, Func other_func,
+                                StorageType starting_bit,
+                                StorageType ending_bit, StorageType step,
+                                size_t rounds, std::ofstream &log) {
+    auto runner = [=](Func func) {
+      // Storing to a volatile keeps the calls from being optimized away.
+      volatile T result;
+      for (size_t i = 0; i < rounds; i++) {
+        for (StorageType bits = starting_bit; bits <= ending_bit;
+             bits += step) {
+          T x = FPBits(bits).get_val();
+          result = func(x);
+        }
+      }
+    };
+
+    size_t number_of_runs = (ending_bit - starting_bit) / step + 1;
+    Timer timer;
+    // Times `func`, logs its section and returns its average ns per call.
+    auto measure = [&](const char *title, Func func) {
+      timer.start();
+      runner(func);
+      timer.stop();
+      double average =
+          static_cast<double>(timer.nanoseconds()) / number_of_runs / rounds;
+      log << "-- " << title << " --\n";
+      log << "     Total time      : " << timer.nanoseconds() << " ns \n";
+      log << "     Average runtime : " << average << " ns/op \n";
+      log << "     Ops per second  : "
+          << static_cast<uint64_t>(1'000'000'000.0 / average) << " op/s \n";
+      return average;
+    };
+
+    double my_average = measure("My function", my_func);
+    double other_average = measure("Other function", other_func);
+    log << "-- Average runtime ratio --\n";
+    log << "     Mine / Other's  : " << my_average / other_average << " \n";
+  }
+
+  // Benchmarks my_func against other_func over five input ranges and writes
+  // the report to `log_file`.
+  static void run_perf(Func my_func, Func other_func, size_t rounds,
+                       const char *log_file) {
+    std::ofstream log(log_file);
+    // The first three ranges advance one exponent per step, so each sweep is
+    // short; they are repeated more often than the two dense ranges.
+    const StorageType exp_step = exp_bits(1);
+    const size_t exp_rounds = rounds * FPBits::EXP_BIAS * FPBits::EXP_BIAS * 2;
+    log << "Performance tests with inputs in normal integral range:\n";
+    run_perf_in_range(my_func, other_func,
+                      /*starting_bit=*/exp_bits(FPBits::EXP_BIAS + 1),
+                      /*ending_bit=*/
+                      exp_bits(FPBits::EXP_BIAS + FPBits::FRACTION_LEN - 1),
+                      /*step=*/exp_step, exp_rounds, log);
+    log << "\n Performance tests with inputs in low integral range:\n";
+    run_perf_in_range(my_func, other_func, /*starting_bit=*/exp_bits(1),
+                      /*ending_bit=*/exp_bits(FPBits::EXP_BIAS - 1),
+                      /*step=*/exp_step, exp_rounds, log);
+    log << "\n Performance tests with inputs in high integral range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/exp_bits(FPBits::EXP_BIAS + FPBits::FRACTION_LEN),
+        /*ending_bit=*/exp_bits(FPBits::MAX_BIASED_EXPONENT),
+        /*step=*/exp_step, exp_rounds, log);
+    log << "\n Performance tests with inputs in normal fractional range:\n";
+    run_perf_in_range(
+        my_func, other_func,
+        /*starting_bit=*/StorageType(exp_bits(FPBits::EXP_BIAS + 1) + 1),
+        /*ending_bit=*/StorageType(exp_bits(FPBits::EXP_BIAS + 2) - 1),
+        /*step=*/StorageType(1), rounds * 2, log);
+    log << "\n Performance tests with inputs in subnormal fractional range:\n";
+    run_perf_in_range(my_func, other_func, /*starting_bit=*/StorageType(1),
+                      /*ending_bit=*/StorageType(FPBits::SIG_MASK),
+                      /*step=*/StorageType(1), rounds, log);
+  }
+};
+
+} // namespace LIBC_NAMESPACE::testing
+
+#define NEAREST_INTEGER_PERF(T, my_func, other_func, rounds, filename)         \
+  {                                                                            \
+    using Perf = LIBC_NAMESPACE::testing::NearestIntegerPerf<T>;               \
+    /* Two identical passes; the second one rewrites the same log file. */     \
+    Perf::run_perf(&my_func, &other_func, rounds, filename);                   \
+    Perf::run_perf(&my_func, &other_func, rounds, filename);                   \
+  }
+
+static constexpr size_t FLOAT16_ROUNDS = 20'000; // float16 base round count
+static constexpr size_t FLOAT_ROUNDS = 40;       // float base round count
+
+// LLVM libc may currently be the only libc implementation with float16 math
+// functions, which leaves no system libc function to compare against. The
+// float16 benchmarks use this identity function as the "other" function.
+float16 placeholder(float16 value) { return value; }
+
+int main() {
+  // float16 functions are timed against ::placeholder.
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::ceilf16, ::placeholder,
+                       FLOAT16_ROUNDS, "ceilf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::floorf16, ::placeholder,
+                       FLOAT16_ROUNDS, "floorf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundevenf16, ::placeholder,
+                       FLOAT16_ROUNDS, "roundevenf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::roundf16, ::placeholder,
+                       FLOAT16_ROUNDS, "roundf16_perf.log")
+  NEAREST_INTEGER_PERF(float16, LIBC_NAMESPACE::truncf16, ::placeholder,
+                       FLOAT16_ROUNDS, "truncf16_perf.log")
+  // float functions are timed against the global-namespace <math.h> ones.
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::ceilf, ::ceilf, FLOAT_ROUNDS,
+                       "ceilf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::floorf, ::floorf, FLOAT_ROUNDS,
+                       "floorf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundevenf, ::roundevenf,
+                       FLOAT_ROUNDS, "roundevenf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::roundf, ::roundf, FLOAT_ROUNDS,
+                       "roundf_perf.log")
+  NEAREST_INTEGER_PERF(float, LIBC_NAMESPACE::truncf, ::truncf, FLOAT_ROUNDS,
+                       "truncf_perf.log")
+  return 0;
+}