[libc-commits] [libc] 484319f - [libc] Make expm1f correctly rounded when the targets have no FMA instructions.

Fri Jun 3 12:58:06 PDT 2022

Author: Tue Ly
Date: 2022-06-03T15:57:48-04:00
New Revision: 484319f4972850d6e9bb3d433042cb56806ba183

URL: https://github.com/llvm/llvm-project/commit/484319f4972850d6e9bb3d433042cb56806ba183
DIFF: https://github.com/llvm/llvm-project/commit/484319f4972850d6e9bb3d433042cb56806ba183.diff

LOG: [libc] Make expm1f correctly rounded when the targets have no FMA instructions.

Add another exceptional value and fix the case when |x| is small.

Performance tests with CORE-MATH project scripts:
With FMA instructions on Ryzen 1700:
```
$ ./perf.sh expm1f
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
CORE-MATH reciprocal throughput   : 15.362
System LIBC reciprocal throughput : 53.194
LIBC reciprocal throughput        : 14.595
$ ./perf.sh expm1f --latency
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
CORE-MATH latency   : 57.755
System LIBC latency : 147.020
LIBC latency        : 60.269
```
Without FMA instructions:
```
$ ./perf.sh expm1f
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
CORE-MATH reciprocal throughput   : 15.362
System LIBC reciprocal throughput : 53.300
LIBC reciprocal throughput        : 18.020
$ ./perf.sh expm1f --latency
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
CORE-MATH latency   : 57.758
System LIBC latency : 147.025
LIBC latency        : 70.304
```

Reviewed By: michaelrj

Differential Revision: https://reviews.llvm.org/D123440

Added: 
    

Modified: 
    libc/src/math/generic/expm1f.cpp
    libc/test/src/math/CMakeLists.txt
    libc/test/src/math/exhaustive/CMakeLists.txt
    libc/test/src/math/exhaustive/expm1f_test.cpp
    libc/test/src/math/expm1f_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index 2cb68e33ebd08..3019d71369c08 100644

--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -34,6 +34,15 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
     return 0x1.8dbe62p-3f;
   }
 
+#if !defined(LIBC_TARGET_HAS_FMA)
+  if (unlikely(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
+    int round_mode = fputil::get_round();
+    if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
+      return -0x1.71c884p-4f;
+    return -0x1.71c882p-4f;
+  }
+#endif // LIBC_TARGET_HAS_FMA
+
   // When |x| > 25*log(2), or nan
   if (unlikely(x_abs >= 0x418a'a123U)) {
     // x < log(2^-25)
@@ -70,19 +79,30 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
       // x = -0.0f
       if (unlikely(xbits.uintval() == 0x8000'0000U))
         return x;
-      // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
-      // is:
-      //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
-      //                               = |x|
-      //                               < 2^-25
-      //                               < epsilon(1)/2.
-      // So the correctly rounded values of expm1(x) are:
-      //   = x + eps(x) if rounding mode = FE_UPWARD,
-      //                   or (rounding mode = FE_TOWARDZERO and x is negative),
-      //   = x otherwise.
-      // To simplify the rounding decision and make it more efficient, we use
-      //   fma(x, x, x) ~ x + x^2 instead.
-      return fputil::multiply_add(x, x, x);
+        // When |x| < 2^-25, the relative error of the approximation e^x - 1 ~ x
+        // is:
+        //   |(e^x - 1) - x| / |e^x - 1| < |x^2| / |x|
+        //                               = |x|
+        //                               < 2^-25
+        //                               < epsilon(1)/2.
+        // So the correctly rounded values of expm1(x) are:
+        //   = x + eps(x) if rounding mode = FE_UPWARD,
+        //                   or (rounding mode = FE_TOWARDZERO and x is
+        //                   negative),
+        //   = x otherwise.
+        // To simplify the rounding decision and make it more efficient, we use
+        //   fma(x, x, x) ~ x + x^2 instead.
+        // Note: to use the formula x + x^2 to decide the correct rounding, we
+        // do need fma(x, x, x) to prevent underflow caused by x*x when |x| <
+        // 2^-76. For targets without FMA instructions, we simply use double for
+        // intermediate results as it is more efficient than using an emulated
+        // version of FMA.
+#if defined(LIBC_TARGET_HAS_FMA)
+      return fputil::fma(x, x, x);
+#else
+      double xd = x;
+      return static_cast<float>(fputil::multiply_add(xd, xd, xd));
+#endif // LIBC_TARGET_HAS_FMA
     }
 
     // 2^-25 <= |x| < 2^-4

diff  --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 956fb170462d0..68a9aed825a9f 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1189,9 +1189,6 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fputil
 )
 
-# Without FMA instructions, the current expm1f implementation is not correctly
-# rounded for all float inputs (1 extra exceptional value). This will be fixed
-# in the followup patch: https://reviews.llvm.org/D123440
 add_fp_unittest(
   expm1f_test
   NEED_MPFR
@@ -1204,8 +1201,6 @@ add_fp_unittest(
     libc.include.math
     libc.src.math.expm1f
     libc.src.__support.FPUtil.fputil
-  FLAGS
-    FMA_OPT__ONLY
 )
 
 add_fp_unittest(

diff  --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt
index b91f3c213a224..34f8a38772570 100644
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@@ -92,7 +92,6 @@ add_fp_unittest(
   DEPENDS
     .exhaustive_test
     libc.include.math
-    libc.src.math.expf
     libc.src.math.expm1f
     libc.src.__support.FPUtil.fputil
   LINK_LIBRARIES

diff  --git a/libc/test/src/math/exhaustive/expm1f_test.cpp b/libc/test/src/math/exhaustive/expm1f_test.cpp
index 5131f1de6025e..b99a2e478d8b2 100644
--- a/libc/test/src/math/exhaustive/expm1f_test.cpp
+++ b/libc/test/src/math/exhaustive/expm1f_test.cpp
@@ -18,7 +18,7 @@ using FPBits = __llvm_libc::fputil::FPBits<float>;
 
 namespace mpfr = __llvm_libc::testing::mpfr;
 
-struct LlvmLibcExpfExhaustiveTest : public LlvmLibcExhaustiveTest<uint32_t> {
+struct LlvmLibcExpm1fExhaustiveTest : public LlvmLibcExhaustiveTest<uint32_t> {
   bool check(uint32_t start, uint32_t stop,
              mpfr::RoundingMode rounding) override {
     mpfr::ForceRoundingMode r(rounding);
@@ -40,21 +40,21 @@ static const int NUM_THREADS = std::thread::hardware_concurrency();
 static constexpr uint32_t POS_START = 0x0000'0000U;
 static constexpr uint32_t POS_STOP = 0x42b2'0000U;
 
-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundNearestTieToEven) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundNearestTieToEven) {
   test_full_range(POS_START, POS_STOP, NUM_THREADS,
                   mpfr::RoundingMode::Nearest);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundUp) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundUp) {
   test_full_range(POS_START, POS_STOP, NUM_THREADS, mpfr::RoundingMode::Upward);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundDown) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundDown) {
   test_full_range(POS_START, POS_STOP, NUM_THREADS,
                   mpfr::RoundingMode::Downward);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundTowardZero) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, PostiveRangeRoundTowardZero) {
   test_full_range(POS_START, POS_STOP, NUM_THREADS,
                   mpfr::RoundingMode::TowardZero);
 }
@@ -63,21 +63,21 @@ TEST_F(LlvmLibcExpfExhaustiveTest, PostiveRangeRoundTowardZero) {
 static constexpr uint32_t NEG_START = 0x8000'0000U;
 static constexpr uint32_t NEG_STOP = 0xc2d0'0000U;
 
-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundNearestTieToEven) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundNearestTieToEven) {
   test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                   mpfr::RoundingMode::Nearest);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundUp) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundUp) {
   test_full_range(NEG_START, NEG_STOP, NUM_THREADS, mpfr::RoundingMode::Upward);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundDown) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundDown) {
   test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                   mpfr::RoundingMode::Downward);
 }
 
-TEST_F(LlvmLibcExpfExhaustiveTest, NegativeRangeRoundTowardZero) {
+TEST_F(LlvmLibcExpm1fExhaustiveTest, NegativeRangeRoundTowardZero) {
   test_full_range(NEG_START, NEG_STOP, NUM_THREADS,
                   mpfr::RoundingMode::TowardZero);
 }

diff  --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
index 78be76ec57f73..032badbb5c2be 100644
--- a/libc/test/src/math/expm1f_test.cpp
+++ b/libc/test/src/math/expm1f_test.cpp
@@ -97,6 +97,16 @@ TEST(LlvmLibcExpm1fTest, Borderline) {
   ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
                                  __llvm_libc::expm1f(x), 0.5);
   EXPECT_MATH_ERRNO(0);
+
+  x = float(FPBits(0x942ed494U));
+  ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                 __llvm_libc::expm1f(x), 0.5);
+  EXPECT_MATH_ERRNO(0);
+
+  x = float(FPBits(0xbdc1c6cbU));
+  ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                 __llvm_libc::expm1f(x), 0.5);
+  EXPECT_MATH_ERRNO(0);
 }
 
 TEST(LlvmLibcExpm1fTest, InFloatRange) {