[libc-commits] [libc] 5dbd511 - [libc][math] Improve tanhf performance.

Tue Jun 20 06:25:17 PDT 2023

Author: Tue Ly
Date: 2023-06-20T09:25:07-04:00
New Revision: 5dbd5118ec5435269602de30ebd4982dd8b273dc

URL: https://github.com/llvm/llvm-project/commit/5dbd5118ec5435269602de30ebd4982dd8b273dc
DIFF: https://github.com/llvm/llvm-project/commit/5dbd5118ec5435269602de30ebd4982dd8b273dc.diff

LOG: [libc][math] Improve tanhf performance.

Re-order exceptional branches and slightly adjust the evaluation.

Performance was measured with the CORE-MATH project's perf tool on an AMD EPYC 7B12; all figures are in clock cycles per call (clc/call).

Reciprocal throughputs:
```

--- BEFORE ---

$ CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 %  (with -mavx2 -mfma)
Ntrial = 20 ; Min = 7.794 + 0.102 clc/call; Median-Min = 0.066 clc/call; Max = 8.267 clc/call;
[####################] 100 %. (with -msse4.2)
Ntrial = 20 ; Min = 10.783 + 0.172 clc/call; Median-Min = 0.144 clc/call; Max = 11.446 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 18.926 + 0.381 clc/call; Median-Min = 0.342 clc/call; Max = 19.623 clc/call;

--- AFTER ---

$ CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 %  (with -mavx2 -mfma)
Ntrial = 20 ; Min = 6.598 + 0.085 clc/call; Median-Min = 0.052 clc/call; Max = 6.868 clc/call;
[####################] 100 %  (with -msse4.2)
Ntrial = 20 ; Min = 9.245 + 0.304 clc/call; Median-Min = 0.248 clc/call; Max = 10.675 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 11.724 + 0.440 clc/call; Median-Min = 0.444 clc/call; Max = 12.262 clc/call;
```

Latency:
```
--- BEFORE ---

$ PERF_ARGS="--latency" CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 %  (with -mavx2 -mfma)
Ntrial = 20 ; Min = 38.821 + 0.157 clc/call; Median-Min = 0.122 clc/call; Max = 39.539 clc/call;
[####################] 100 %. (with -msse4.2)
Ntrial = 20 ; Min = 44.767 + 0.766 clc/call; Median-Min = 0.681 clc/call; Max = 45.951 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 55.055 + 1.512 clc/call; Median-Min = 1.571 clc/call; Max = 57.039 clc/call;

--- AFTER ---

$ PERF_ARGS="--latency" CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 %  (with -mavx2 -mfma)
Ntrial = 20 ; Min = 36.147 + 0.194 clc/call; Median-Min = 0.181 clc/call; Max = 36.536 clc/call;
[####################] 100 %  (with -msse4.2)
Ntrial = 20 ; Min = 40.904 + 0.728 clc/call; Median-Min = 0.557 clc/call; Max = 42.231 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 55.776 + 0.557 clc/call; Median-Min = 0.542 clc/call; Max = 56.551 clc/call;
```

Reviewed By: michaelrj

Differential Revision: https://reviews.llvm.org/D153026

Added: 
    

Modified: 
    libc/src/__support/macros/properties/CMakeLists.txt
    libc/src/__support/macros/properties/cpu_features.h
    libc/src/math/generic/CMakeLists.txt
    libc/src/math/generic/tanhf.cpp
    libc/test/src/math/tanhf_test.cpp
    utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Removed: 
    


################################################################################
diff  --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index 8ddd16a1e3669..f6c88c34ebb5a 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -1,17 +1,19 @@
 add_header_library(
-    architectures
-    HDRS
+  architectures
+  HDRS
     architectures.h
 )
 
 add_header_library(
-    compiler
-    HDRS
+  compiler
+  HDRS
     compiler.h
 )
 
 add_header_library(
-    cpu_features
-    HDRS
+  cpu_features
+  HDRS
     cpu_features.h
+  DEPENDS
+    .architectures
 )

diff  --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 086b216e27573..493d9f446d374 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -12,6 +12,8 @@
 #ifndef LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
 #define LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
 
+#include "architectures.h"
+
 #if defined(__SSE2__)
 #define LIBC_TARGET_CPU_HAS_SSE2
 #endif
@@ -41,4 +43,10 @@
 #define LIBC_TARGET_CPU_HAS_FMA
 #endif
 
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) ||                                    \
+    (defined(LIBC_TARGET_ARCH_IS_X86_64) &&                                    \
+     defined(LIBC_TARGET_CPU_HAS_SSE4_2))
+#define LIBC_TARGET_CPU_HAS_NEAREST_INT
+#endif
+
 #endif // LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H

diff  --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 3d5273fa90cc1..098b69817fb20 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1374,6 +1374,8 @@ add_entrypoint_object(
     .explogxf
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
     libc.src.__support.macros.optimization
   COMPILE_OPTIONS
     -O3

diff  --git a/libc/src/math/generic/tanhf.cpp b/libc/src/math/generic/tanhf.cpp
index eb6b50a64b014..21c3ed8c8c333 100644
--- a/libc/src/math/generic/tanhf.cpp
+++ b/libc/src/math/generic/tanhf.cpp
@@ -8,66 +8,114 @@
 
 #include "src/math/tanhf.h"
 #include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/macros/optimization.h"            // LIBC_UNLIKELY
-#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/cpu_features.h"
 #include "src/math/generic/explogxf.h"
 
 namespace __llvm_libc {
 
+// 2^6 * log2(e)
+constexpr double LOG2_E_EXP2_6 = ExpBase::LOG2_B * 2.0;
+
 LLVM_LIBC_FUNCTION(float, tanhf, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
   FPBits xbits(x);
-  bool sign = xbits.get_sign();
-  uint32_t x_abs = xbits.uintval() & FPBits::FloatProp::EXP_MANT_MASK;
+  uint32_t x_u = xbits.uintval();
+  uint32_t x_abs = x_u & FPBits::FloatProp::EXP_MANT_MASK;
 
-  // |x| <= 2^-26
-  if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
-    return static_cast<float>(
-        LIBC_UNLIKELY(x_abs == 0) ? x : (x - 0x1.5555555555555p-2 * x * x * x));
-  }
+  // When |x| >= 15, or x is inf or nan, or |x| <= 0.078125
+  if (LIBC_UNLIKELY((x_abs >= 0x4170'0000U) || (x_abs <= 0x3da0'0000U))) {
+    if (x_abs <= 0x3da0'0000U) {
+      // |x| <= 0.078125
+      if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
+        // |x| <= 2^-26
+        return (x_abs != 0)
+                   ? static_cast<float>(x - 0x1.5555555555555p-2 * x * x * x)
+                   : x;
+      }
+
+      const double TAYLOR[] = {-0x1.5555555555555p-2, 0x1.1111111111111p-3,
+                               -0x1.ba1ba1ba1ba1cp-5, 0x1.664f4882c10fap-6,
+                               -0x1.226e355e6c23dp-7};
+      double xdbl = x;
+      double x2 = xdbl * xdbl;
+      // Taylor polynomial.
+      double x4 = x2 * x2;
+      double c0 = x2 * TAYLOR[0];
+      double c1 = fputil::multiply_add(x2, TAYLOR[2], TAYLOR[1]);
+      double c2 = fputil::multiply_add(x2, TAYLOR[4], TAYLOR[3]);
+      double pe = fputil::polyeval(x4, c0, c1, c2);
 
-  // When |x| >= 15, or x is inf or nan
-  if (LIBC_UNLIKELY(x_abs >= 0x4170'0000U)) {
-    if (xbits.is_nan())
+      return static_cast<float>(fputil::multiply_add(xdbl, pe, xdbl));
+    }
+
+    // |x| >= 15
+    if (LIBC_UNLIKELY(xbits.is_nan()))
       return x + 1.0f; // sNaN to qNaN + signal
 
-    if (xbits.is_inf())
-      return sign ? -1.0f : 1.0f;
+    const double SIGNS[2][2] = {{1.0f, -0x1.0p-25f}, {-1.0f, 0x1.0p-25f}};
 
-    if (sign) {
-      return -1.0f + opt_barrier(FPBits(FPBits::MIN_NORMAL).get_val());
-    } else
-      return 1.0f - opt_barrier(FPBits(FPBits::MIN_NORMAL).get_val());
-  }
+    bool sign = xbits.get_sign();
+    int idx = static_cast<int>(sign);
 
-  // |x| <= 0.078125
-  if (LIBC_UNLIKELY(x_abs <= 0x3da0'0000U)) {
-    double xdbl = x;
-    double x2 = xdbl * xdbl;
-    // Pure Taylor series.
-    double pe = fputil::polyeval(x2, 0.0, -0x1.5555555555555p-2,
-                                 0x1.1111111111111p-3, -0x1.ba1ba1ba1ba1cp-5,
-                                 0x1.664f4882c10fap-6, -0x1.226e355e6c23dp-7);
-    return static_cast<float>(fputil::multiply_add(xdbl, pe, xdbl));
-  }
+    if (LIBC_UNLIKELY(xbits.is_inf()))
+      return SIGNS[idx][0];
 
-  if (LIBC_UNLIKELY(xbits.bits == 0x4058'e0a3U)) {
-    if (fputil::fenv_is_round_down())
-      return FPBits(0x3f7f'6ad9U).get_val();
+    return SIGNS[idx][0] + SIGNS[idx][1];
   }
 
-  // Range reduction: e^(2x) = 2^(mid + hi) * e^lo
-  auto ep = exp_b_range_reduc<ExpBase>(2.0f * x); // exp(2 * x)
-  double r = ExpBase::powb_lo(ep.lo);
-  // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
-  return static_cast<float>(fputil::multiply_add(ep.mh, r, -1.0) /
-                            fputil::multiply_add(ep.mh, r, 1.0));
+  // Range reduction: e^(2x) = 2^(hi + mid) * e^lo
+  // Let  k = round( x * 2^6 * log2(e)),
+  // So   k  = (hi + mid) * 2^5
+  // Then lo = 2x - (hi + mid) * log(2) = 2x - k * 2^-5 * log(2).
+
+  double xd = static_cast<double>(x);
+  // k = round( x* 2^6 * log2(e) )
+  double k;
+  // mk = -k
+  int mk;
+#ifdef LIBC_TARGET_CPU_HAS_NEAREST_INT
+  k = fputil::nearest_integer(xd * LOG2_E_EXP2_6);
+  mk = -static_cast<int>(k);
 #else
-  double exp_x = ep.mh * r;
-  return static_cast<float>((exp_x - 1.0) / (exp_x + 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+  constexpr double HALF_WAY[2] = {-0.5, 0.5};
+
+  mk = static_cast<int>(
+      fputil::multiply_add(xd, -LOG2_E_EXP2_6, HALF_WAY[xbits.get_sign()]));
+  k = static_cast<double>(-mk);
+#endif // LIBC_TARGET_CPU_HAS_NEAREST_INT
+  // -hi = floor(-k * 2^(-MID_BITS))
+  // exp_mhi = shift -hi to the exponent field of double precision.
+  int64_t exp_mhi = static_cast<int64_t>(mk >> ExpBase::MID_BITS)
+                    << fputil::FloatProperties<double>::MANTISSA_WIDTH;
+  // mh = 2^(-hi - mid)
+  int64_t mh_bits = ExpBase::EXP_2_MID[mk & ExpBase::MID_MASK] + exp_mhi;
+  double mh = fputil::FPBits<double>(uint64_t(mh_bits)).get_val();
+  // dx = lo/2 = x - (hi + mid) * log(2)/2 = x - k * 2^-6 * log(2)
+  double dx = fputil::multiply_add(
+      k, ExpBase::M_LOGB_2_LO * 0.5,
+      fputil::multiply_add(k, ExpBase::M_LOGB_2_HI * 0.5, xd));
+
+  // > P = fpminimax(expm1(2*x)/x, 4, [|D...|], [-log(2)/128, log(2)/128]);
+  constexpr double COEFFS[] = {0x1.ffffffffe5bc8p0, 0x1.555555555cd67p0,
+                               0x1.5555c2a9b48b4p-1, 0x1.11112a0e34bdbp-2};
+
+  double dx2 = dx * dx;
+  double c0 = fputil::multiply_add(dx, 2.0, 1.0);
+  double c1 = fputil::multiply_add(dx, COEFFS[1], COEFFS[0]);
+  double c2 = fputil::multiply_add(dx, COEFFS[3], COEFFS[2]);
+  double r = fputil::polyeval(dx2, c0, c1, c2);
+
+  // tanh(x) = sinh(x) / cosh(x)
+  //         = (e^x - e^(-x)) / (e^x + e^(-x))
+  //         = (e^(2x) - 1) / (e^(2x) + 1)
+  //         = (2^(hi + mid) * e^lo - 1) / (2^(hi + mid) * e^lo + 1)
+  //         = (e^lo - 2^(-hi - mid)) / (e^lo + 2^(-hi - mid))
+  //         = (r - mh) / (r + mh)
+  return static_cast<float>((r - mh) / (r + mh));
 }
 
 } // namespace __llvm_libc

diff  --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index f4a4b72dd162a..07afe14a263d0 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -43,35 +43,31 @@ TEST(LlvmLibcTanhfTest, SpecialNumbers) {
 }
 
 TEST(LlvmLibcTanhfTest, InFloatRange) {
-  constexpr uint32_t COUNT = 100'000;
+  constexpr uint32_t COUNT = 100'001;
   constexpr uint32_t STEP = UINT32_MAX / COUNT;
   for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
     float x = float(FPBits(v));
     if (isnan(x) || isinf(x))
       continue;
-    ASSERT_MPFR_MATCH(mpfr::Operation::Tanh, x, __llvm_libc::tanhf(x), 0.5);
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
+                                   __llvm_libc::tanhf(x), 0.5);
   }
 }
 
-// For small values, tanh(x) is x.
-TEST(LlvmLibcTanhfTest, SmallValues) {
-  float x = float(FPBits(uint32_t(0x17800000)));
-  float result = __llvm_libc::tanhf(x);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Tanh, x, result, 0.5);
-  EXPECT_FP_EQ(x, result);
-
-  x = float(FPBits(uint32_t(0x00400000)));
-  result = __llvm_libc::tanhf(x);
-  EXPECT_MPFR_MATCH(mpfr::Operation::Tanh, x, result, 0.5);
-  EXPECT_FP_EQ(x, result);
-}
-
 TEST(LlvmLibcTanhfTest, ExceptionalValues) {
-  float x = float(FPBits(uint32_t(0x3a12'85ffU)));
-  EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
-                                 __llvm_libc::tanhf(x), 0.5);
-
-  x = -float(FPBits(uint32_t(0x3a12'85ffU)));
-  EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
-                                 __llvm_libc::tanhf(x), 0.5);
+  constexpr int N = 4;
+  constexpr uint32_t INPUTS[N] = {
+      0x0040'0000,
+      0x1780'0000,
+      0x3a12'85ff,
+      0x4058'e0a3,
+  };
+
+  for (int i = 0; i < N; ++i) {
+    float x = float(FPBits(INPUTS[i]));
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
+                                   __llvm_libc::tanhf(x), 0.5);
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, -x,
+                                   __llvm_libc::tanhf(-x), 0.5);
+  }
 }

diff  --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 58aa99d007743..eb7a3c87fc0f7 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -81,7 +81,10 @@ libc_support_library(
 libc_support_library(
     name = "__support_macros_properties_cpu_features",
     hdrs = ["src/__support/macros/properties/cpu_features.h"],
-    deps = [":libc_root"],
+    deps = [
+        "__support_macros_properties_architectures",
+        ":libc_root",
+    ],
 )
 
 libc_support_library(