[libc-commits] [libc] 4973eee - [libc][math] Improve tanhf performance.

Tue Ly via libc-commits libc-commits at lists.llvm.org
Mon Sep 19 05:43:17 PDT 2022


Author: Tue Ly
Date: 2022-09-19T08:43:03-04:00
New Revision: 4973eee1228674c80f9441a36019c8a83ee3458a

URL: https://github.com/llvm/llvm-project/commit/4973eee1228674c80f9441a36019c8a83ee3458a
DIFF: https://github.com/llvm/llvm-project/commit/4973eee1228674c80f9441a36019c8a83ee3458a.diff

LOG: [libc][math] Improve tanhf performance.

Optimize the core part of the `tanhf` implementation, which computes `e^x`,
similarly to https://reviews.llvm.org/D133870.  Factor out the constants and
polynomial approximation so that they can be reused for `exp10f`.

Performance benchmark using perf tool from the CORE-MATH project on Ryzen 1700:
```
$ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh tanhf
GNU libc version: 2.35
GNU libc release: stable
CORE-MATH reciprocal throughput   : 13.377
System LIBC reciprocal throughput : 55.046

BEFORE:
LIBC reciprocal throughput        : 75.674
LIBC reciprocal throughput        : 33.242    (with `-msse4.2` flag)
LIBC reciprocal throughput        : 25.927    (with `-mfma` flag)

AFTER:
LIBC reciprocal throughput        : 26.359
LIBC reciprocal throughput        : 18.888    (with `-msse4.2` flag)
LIBC reciprocal throughput        : 14.243    (with `-mfma` flag)

$ CORE_MATH_PERF_MODE="rdtsc" ./perf.sh tanhf --latency
GNU libc version: 2.35
GNU libc release: stable
CORE-MATH latency   : 43.365
System LIBC latency : 123.499

BEFORE
LIBC latency        : 112.968
LIBC latency        : 104.908   (with `-msse4.2` flag)
LIBC latency        : 92.310    (with `-mfma` flag)

AFTER
LIBC latency        : 69.828
LIBC latency        : 63.874    (with `-msse4.2` flag)
LIBC latency        : 57.427    (with `-mfma` flag)
```

Reviewed By: orex, zimmermann6

Differential Revision: https://reviews.llvm.org/D134002

Added: 
    

Modified: 
    libc/docs/math.rst
    libc/src/math/generic/exp2f.cpp
    libc/src/math/generic/explogxf.cpp
    libc/src/math/generic/explogxf.h
    libc/src/math/generic/tanhf.cpp
    libc/test/src/math/explogxf_test.cpp

Removed: 
    


################################################################################
diff  --git a/libc/docs/math.rst b/libc/docs/math.rst
index 4592238890c50..e30b99c14da68 100644
--- a/libc/docs/math.rst
+++ b/libc/docs/math.rst
@@ -215,11 +215,11 @@ Performance
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | cosf         |        13 |                32 |        53 |                59 | :math:`[0, 2\pi]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| coshf        |        15 |                20 |        51 |                48 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
+| coshf        |        14 |                20 |        50 |                48 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | expf         |         9 |                 7 |        44 |                38 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| exp2f        |         9 |                 6 |        37 |                31 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
+| exp2f        |         9 |                 6 |        35 |                31 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | expm1f       |         9 |                44 |        42 |               121 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
@@ -245,11 +245,11 @@ Performance
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | sincosf      |        19 |                30 |        57 |                68 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| sinhf        |        14 |                63 |        49 |               137 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
+| sinhf        |        13 |                63 |        48 |               137 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 | tanf         |        19 |                50 |        82 |               107 | :math:`[-\pi, \pi]`                 | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| tanhf        |        25 |                59 |        95 |               125 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA           |
+| tanhf        |        13 |                55 |        57 |               123 | :math:`[-10, 10]`                   | Ryzen 1700 | Ubuntu 22.04 LTS x86_64 | Clang 14.0.0 | FMA           |
 +--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
 
 References

diff  --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp
index 411ae3359b540..118cb87e50b13 100644
--- a/libc/src/math/generic/exp2f.cpp
+++ b/libc/src/math/generic/exp2f.cpp
@@ -83,48 +83,48 @@ LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
   // reduction: find hi, mid, lo such that:
   //   x = hi + mid + lo, in which
   //     hi is an integer,
-  //     0 <= mid * 2^4 < 16 is an integer
-  //     -2^(-5) <= lo <= 2^-5.
+  //     0 <= mid * 2^5 < 32 is an integer
+  //     -2^(-6) <= lo <= 2^-6.
   // In particular,
-  //   hi + mid = round(x * 2^4) * 2^(-4).
+  //   hi + mid = round(x * 2^5) * 2^(-5).
   // Then,
   //   2^x = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo.
-  // 2^mid is stored in the lookup table EXP_2_M of 16 elements.
-  // 2^lo is computed using a degree-6 minimax polynomial
+  // 2^mid is stored in the lookup table of 32 elements.
+  // 2^lo is computed using a degree-5 minimax polynomial
   // generated by Sollya.
   // We perform 2^hi * 2^mid by simply add hi to the exponent field
   // of 2^mid.
 
-  // kf = (hi + mid) * 2^4 = round(x * 2^4)
-  float kf = fputil::nearest_integer(x * 16.0f);
-  // dx = lo = x - (hi + mid) = x - kf * 2^(-4)
-  double dx = fputil::multiply_add(-0x1.0p-4f, kf, x);
+  // kf = (hi + mid) * 2^5 = round(x * 2^5)
+  float kf = fputil::nearest_integer(x * 32.0f);
+  // dx = lo = x - (hi + mid) = x - kf * 2^(-5)
+  double dx = fputil::multiply_add(-0x1.0p-5f, kf, x);
 
   int k = static_cast<int>(kf);
   // hi = floor(kf * 2^(-4))
   // exp_hi = shift hi to the exponent field of double precision.
-  int64_t exp_hi = static_cast<int64_t>(k >> 4)
+  int64_t exp_hi = static_cast<int64_t>(k >> ExpBase::MID_BITS)
                    << fputil::FloatProperties<double>::MANTISSA_WIDTH;
   // mh = 2^hi * 2^mid
   // mh_bits = bit field of mh
-  int64_t mh_bits = EXP_2_M[k & 15] + exp_hi;
+  int64_t mh_bits = ExpBase::EXP_2_MID[k & ExpBase::MID_MASK] + exp_hi;
   double mh = fputil::FPBits<double>(uint64_t(mh_bits)).get_val();
 
   // Degree-5 polynomial approximating (2^x - 1)/x generating by Sollya with:
   // > P = fpminimax((2^x - 1)/x, 5, [|D...|], [-1/32. 1/32]);
-  constexpr double COEFFS[6] = {0x1.62e42fefa39f3p-1,  0x1.ebfbdff82c57bp-3,
-                                0x1.c6b08d6f2d7aap-5,  0x1.3b2ab6fc92f5dp-7,
-                                0x1.5d897cfe27125p-10, 0x1.43090e61e6af1p-13};
+  constexpr double COEFFS[5] = {0x1.62e42fefa39efp-1, 0x1.ebfbdff8131c4p-3,
+                                0x1.c6b08d7061695p-5, 0x1.3b2b1bee74b2ap-7,
+                                0x1.5d88091198529p-10};
   double dx_sq = dx * dx;
-  double c1 = fputil::multiply_add(dx, COEFFS[1], COEFFS[0]);
-  double c2 = fputil::multiply_add(dx, COEFFS[3], COEFFS[2]);
-  double c3 = fputil::multiply_add(dx, COEFFS[5], COEFFS[4]);
-  double p = fputil::polyeval(dx_sq, c1, c2, c3);
+  double c1 = fputil::multiply_add(dx, COEFFS[0], 1.0);
+  double c2 = fputil::multiply_add(dx, COEFFS[2], COEFFS[1]);
+  double c3 = fputil::multiply_add(dx, COEFFS[4], COEFFS[3]);
+  double p = fputil::multiply_add(dx_sq, c3, c2);
   // 2^x = 2^(hi + mid + lo)
   //     = 2^(hi + mid) * 2^lo
   //     ~ mh * (1 + lo * P(lo))
   //     = mh + (mh*lo) * P(lo)
-  return fputil::multiply_add(p, dx * mh, mh);
+  return fputil::multiply_add(p, dx_sq * mh, c1 * mh);
 }
 
 } // namespace __llvm_libc

diff  --git a/libc/src/math/generic/explogxf.cpp b/libc/src/math/generic/explogxf.cpp
index 47d54e70b04e4..3e12e8a0ce4bb 100644
--- a/libc/src/math/generic/explogxf.cpp
+++ b/libc/src/math/generic/explogxf.cpp
@@ -10,21 +10,6 @@
 
 namespace __llvm_libc {
 
-// Wolfram alpha: N[Table[2^x-1,{x,-16/32,15/32,1/32}],27]
-// printf("%.13a,\n", d[i]);
-alignas(64) const double EXP_2_POW[EXP_num_p] = {
-    -0x1.2bec333018867p-2, -0x1.1c1142e274118p-2, -0x1.0bdd71829fcf2p-2,
-    -0x1.f69d99accc7b6p-3, -0x1.d4c6af7557c93p-3, -0x1.b23213cc8e86cp-3,
-    -0x1.8edb9f5703dc0p-3, -0x1.6abf137076a8ep-3, -0x1.45d819a94b14bp-3,
-    -0x1.20224341286e4p-3, -0x1.f332113d56b1fp-4, -0x1.a46f918837cb7p-4,
-    -0x1.53f391822dbc7p-4, -0x1.01b466423250ap-4, -0x1.5b505d5b6f268p-5,
-    -0x1.5f134923757f3p-6, 0x0.0000000000000p+0,  0x1.66c34c5615d0fp-6,
-    0x1.6ab0d9f3121ecp-5,  0x1.1301d0125b50ap-4,  0x1.72b83c7d517aep-4,
-    0x1.d4873168b9aa8p-4,  0x1.1c3d373ab11c3p-3,  0x1.4f4efa8fef709p-3,
-    0x1.837f0518db8a9p-3,  0x1.b8d39b9d54e55p-3,  0x1.ef5326091a112p-3,
-    0x1.13821818624b4p-2,  0x1.2ff6b54d8a89cp-2,  0x1.4d0ad5a753e07p-2,
-    0x1.6ac1f752150a5p-2,  0x1.891fac0e95613p-2};
-
 // N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40]
 alignas(64) const double LOG_P1_LOG2[LOG_P1_SIZE] = {
     0x0.0000000000000p+0, 0x1.6e79685c2d22ap-6, 0x1.6bad3758efd87p-5,

diff  --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h
index b5639a8ac4194..ed83e44e72e9a 100644
--- a/libc/src/math/generic/explogxf.h
+++ b/libc/src/math/generic/explogxf.h
@@ -21,25 +21,54 @@
 
 namespace __llvm_libc {
 
-static constexpr int EXP_bits_p = 5;
-static constexpr int EXP_num_p = 1 << EXP_bits_p;
-constexpr double mlp = EXP_num_p;
-constexpr double mmld = -1.0 / mlp;
-
-// Wolfram alpha: N[Table[2^x-1,{x,-16/32,15/32,1/32}],27]
-// printf("%.13a,\n", d[i]);
-extern const double EXP_2_POW[EXP_num_p];
-
-// Look up table for bit fields of 2^(i/16) for i = 0..15, generated by Sollya
-// with:
-// > for i from 0 to 15 do printdouble(round(2^(i/16), D, RN));
-inline constexpr int64_t EXP_2_M[16] = {
-    0x3ff0000000000000, 0x3ff0b5586cf9890f, 0x3ff172b83c7d517b,
-    0x3ff2387a6e756238, 0x3ff306fe0a31b715, 0x3ff3dea64c123422,
-    0x3ff4bfdad5362a27, 0x3ff5ab07dd485429, 0x3ff6a09e667f3bcd,
-    0x3ff7a11473eb0187, 0x3ff8ace5422aa0db, 0x3ff9c49182a3f090,
-    0x3ffae89f995ad3ad, 0x3ffc199bdd85529c, 0x3ffd5818dcfba487,
-    0x3ffea4afa2a490da};
+struct ExpBase {
+  // Base = e
+  static constexpr int MID_BITS = 5;
+  static constexpr int MID_MASK = (1 << MID_BITS) - 1;
+  // log2(e) * 2^5
+  static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS);
+  // High and low parts of -log(2) * 2^(-5)
+  static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS);
+  static constexpr double M_LOGB_2_LO =
+      -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS);
+  // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya
+  // with:
+  // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN));
+  static constexpr int64_t EXP_2_MID[1 << MID_BITS] = {
+      0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f,
+      0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa,
+      0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715,
+      0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d,
+      0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429,
+      0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74,
+      0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db,
+      0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d,
+      0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c,
+      0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f,
+      0x3ffea4afa2a490da, 0x3fff50765b6e4540,
+  };
+
+  // Approximating e^dx with degree-5 minimax polynomial generated by Sollya:
+  // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]);
+  // Then:
+  //   e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5.
+  static constexpr double COEFFS[4] = {
+      0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5,
+      0x1.11112a0e34bdbp-7};
+
+  static constexpr double powb_lo(double dx) {
+    using fputil::multiply_add;
+    double dx2 = dx * dx;
+    double c0 = 1.0 + dx;
+    // c1 = COEFFS[0] + COEFFS[1] * dx
+    double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]);
+    // c2 = COEFFS[2] + COEFFS[3] * dx
+    double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]);
+    // r = c0 + c1 * dx^2 + c2 * dx^4
+    //   = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5
+    return fputil::polyeval(dx2, c0, c1, c2);
+  }
+};
 
 constexpr int LOG_P1_BITS = 6;
 constexpr int LOG_P1_SIZE = 1 << LOG_P1_BITS;
@@ -55,65 +84,50 @@ extern const double LOG_P1_1_OVER[LOG_P1_SIZE];
 extern const double K_LOG2_ODD[4];
 extern const double K_LOG2_EVEN[4];
 
-// The algorithm represents exp(x) as
-//   exp(x) = 2^(ln(2) * i) * 2^(ln(2) * j / NUM_P )) * exp(dx)
-// where i integer value, j integer in range [-NUM_P/2, NUM_P/2).
-// 2^(ln(2) * j / NUM_P )) is a table values: 1.0 + EXP_M
-// exp(dx) calculates by taylor expansion.
-
-// Inversion of ln(2). Multiplication by EXP_num_p due to sampling by 1 /
-// EXP_num_p Precise value of the constant is not needed.
-static constexpr double LN2_INV = 0x1.71547652b82fep+0 * EXP_num_p;
-
-// log2(e) * 2^4
-static constexpr double LOG2_E_4 = 0x1.71547652b82fep+4;
-
-// LN2_HIGH + LN2_LOW = ln(2) with precision higher than double(ln(2))
-// Minus sign is to use FMA directly.
-static constexpr double LN2_HIGH = -0x1.62e42fefa0000p-1 / EXP_num_p;
-static constexpr double LN2_LOW = -0x1.cf79abc9e3b3ap-40 / EXP_num_p;
-
-// -log(2) * 2^(-4)
-static constexpr double M_LN2_4_HI = -0x1.62e42fefa0000p-5;
-static constexpr double M_LN2_4_LO = -0x1.cf79abc9e3b3ap-44;
-
-struct exe_eval_result_t {
-  // exp(x) = 2^MULT_POWER2 * mult_exp * (r + 1.0)
-  // where
-  //   MULT_POWER2 template parameter;
-  //   mult_exp = 2^e;
-  //   r in range [~-0.3, ~0.41]
-  double mult_exp;
-  double r;
+// Output of range reduction for exp_b: (2^(mid + hi), lo)
+// where:
+//   b^x = 2^(mid + hi) * b^lo
+struct exp_b_reduc_t {
+  double mh; // 2^(mid + hi)
+  double lo;
 };
 
-// The function correctly calculates exp value with at least float precision
-// in range not narrow than [-log(2^-150), 90]
-template <int MULT_POWER2 = 0>
-inline static exe_eval_result_t exp_eval(double x) {
-  double ps_dbl = fputil::nearest_integer(LN2_INV * x);
-  // Negative sign due to multiply_add optimization
-  double mult_e1, ml;
-  {
-    int ps =
-        static_cast<int>(ps_dbl) + (1 << (EXP_bits_p - 1)) +
-        ((fputil::FPBits<double>::EXPONENT_BIAS + MULT_POWER2) << EXP_bits_p);
-    int table_index = ps & (EXP_num_p - 1);
-    fputil::FPBits<double> bs;
-    bs.set_unbiased_exponent(ps >> EXP_bits_p);
-    ml = EXP_2_POW[table_index];
-    mult_e1 = bs.get_val();
-  }
-  double dx = fputil::multiply_add(ps_dbl, LN2_LOW,
-                                   fputil::multiply_add(ps_dbl, LN2_HIGH, x));
-
-  // Taylor series coefficients
-  double pe = dx * fputil::polyeval(dx, 1.0, 0x1.0p-1, 0x1.5555555555555p-3,
-                                    0x1.5555555555555p-5, 0x1.1111111111111p-7,
-                                    0x1.6c16c16c16c17p-10);
-
-  double r = fputil::multiply_add(ml, pe, pe) + ml;
-  return {mult_e1, r};
+// The function correctly calculates b^x value with at least float precision
+// in a limited range.
+// Range reduction:
+//   b^x = 2^(hi + mid) * b^lo
+// where:
+//   x = (hi + mid) * log_b(2) + lo
+//   hi is an integer,
+//   0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer
+//   -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1)
+// Base class needs to provide the following constants:
+//   - MID_BITS    : number of bits after decimal points used for mid
+//   - MID_MASK    : 2^MID_BITS - 1, mask to extract mid bits
+//   - LOG2_B      : log2(b) * 2^MID_BITS for scaling
+//   - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS)
+//   - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS)
+//   - EXP_2_MID   : look up table for bit fields of 2^mid
+// Return:
+//   { 2^(hi + mid), lo }
+template <class Base> static inline exp_b_reduc_t exp_b_range_reduc(float x) {
+  double xd = static_cast<double>(x);
+  // kd = round((hi + mid) * log2(b) * 2^MID_BITS)
+  double kd = fputil::nearest_integer(Base::LOG2_B * xd);
+  // k = round((hi + mid) * log2(b) * 2^MID_BITS)
+  int k = static_cast<int>(kd);
+  // hi = floor(kd * 2^(-MID_BITS))
+  // exp_hi = shift hi to the exponent field of double precision.
+  int64_t exp_hi = static_cast<int64_t>((k >> Base::MID_BITS))
+                   << fputil::FloatProperties<double>::MANTISSA_WIDTH;
+  // mh = 2^hi * 2^mid
+  // mh_bits = bit field of mh
+  int64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi;
+  double mh = fputil::FPBits<double>(uint64_t(mh_bits)).get_val();
+  // dx = lo = x - (hi + mid) * log(2)
+  double dx = fputil::multiply_add(
+      kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd));
+  return {mh, dx};
 }
 
 // The function correctly calculates sinh(x) and cosh(x) by calculating exp(x)
@@ -122,17 +136,17 @@ inline static exe_eval_result_t exp_eval(double x) {
 // reduction: find hi, mid, lo such that:
 //   x = (hi + mid) * log(2) + lo, in which
 //     hi is an integer,
-//     0 <= mid * 2^4 < 16 is an integer
-//     -2^(-5) <= lo * log2(e) <= 2^-5.
+//     0 <= mid * 2^5 < 32 is an integer
+//     -2^(-6) <= lo * log2(e) <= 2^-6.
 // In particular,
-//   hi + mid = round(x * log2(e) * 2^4) * 2^(-4).
+//   hi + mid = round(x * log2(e) * 2^5) * 2^(-5).
 // Then,
 //   e^x = 2^(hi + mid) * e^lo = 2^hi * 2^mid * e^lo.
-// 2^mid is stored in the lookup table EXP_2_M of 16 elements.
-// e^lo is computed using a degree-6 minimax polynomial
+// 2^mid is stored in the lookup table of 32 elements.
+// e^lo is computed using a degree-5 minimax polynomial
 // generated by Sollya:
-//   e^lo ~ P(lo) = 1 + lo + c2 * lo^2 + ... + c6 * lo^6
-//        = (1 + c2*lo^2 + c4*lo^4 + c6*lo^6) + lo * (1 + c3*lo^2 + c5*lo^4)
+//   e^lo ~ P(lo) = 1 + lo + c2 * lo^2 + ... + c5 * lo^5
+//        = (1 + c2*lo^2 + c4*lo^4) + lo * (1 + c3*lo^2 + c5*lo^4)
 //        = P_even + lo * P_odd
 // We perform 2^hi * 2^mid by simply add hi to the exponent field
 // of 2^mid.
@@ -156,24 +170,25 @@ inline static exe_eval_result_t exp_eval(double x) {
 template <bool is_sinh> static inline double exp_pm_eval(float x) {
   double xd = static_cast<double>(x);
 
-  // round(x * log2(e) * 2^4)
-  double kd = fputil::nearest_integer(LOG2_E_4 * xd);
+  // round(x * log2(e) * 2^5)
+  double kd = fputil::nearest_integer(ExpBase::LOG2_B * xd);
 
-  // k_p = round(x * log2(e) * 2^4)
+  // k_p = round(x * log2(e) * 2^5)
   int k_p = static_cast<int>(kd);
-  // k_m = round(-x * log2(e) * 2^4)
+  // k_m = round(-x * log2(e) * 2^5)
   int k_m = -k_p;
 
-  // hi = floor(kf * 2^(-4))
+  // hi = floor(kf * 2^(-5))
   // exp_hi = shift hi to the exponent field of double precision.
-  int64_t exp_hi_p = static_cast<int64_t>((k_p >> 4))
+  int64_t exp_hi_p = static_cast<int64_t>((k_p >> ExpBase::MID_BITS))
                      << fputil::FloatProperties<double>::MANTISSA_WIDTH;
-  int64_t exp_hi_m = static_cast<int64_t>((k_m >> 4))
+  int64_t exp_hi_m = static_cast<int64_t>((k_m >> ExpBase::MID_BITS))
                      << fputil::FloatProperties<double>::MANTISSA_WIDTH;
-  // mh = 2^hi * 2^mid
-  // mh_bits = bit field of mh
-  int64_t mh_bits_p = EXP_2_M[k_p & 15] + exp_hi_p;
-  int64_t mh_bits_m = EXP_2_M[k_m & 15] + exp_hi_m;
+  // mh_p = 2^(hi + mid)
+  // mh_m = 2^(-(hi + mid))
+  // mh_bits_* = bit field of mh_*
+  int64_t mh_bits_p = ExpBase::EXP_2_MID[k_p & ExpBase::MID_MASK] + exp_hi_p;
+  int64_t mh_bits_m = ExpBase::EXP_2_MID[k_m & ExpBase::MID_MASK] + exp_hi_m;
   double mh_p = fputil::FPBits<double>(uint64_t(mh_bits_p)).get_val();
   double mh_m = fputil::FPBits<double>(uint64_t(mh_bits_m)).get_val();
   // mh_sum = 2^(hi + mid) + 2^(-(hi + mid))
@@ -182,31 +197,18 @@ template <bool is_sinh> static inline double exp_pm_eval(float x) {
   double mh_diff = mh_p - mh_m;
 
   // dx = lo = x - (hi + mid) * log(2)
-  double dx = fputil::multiply_add(kd, M_LN2_4_LO,
-                                   fputil::multiply_add(kd, M_LN2_4_HI, xd));
+  double dx =
+      fputil::multiply_add(kd, ExpBase::M_LOGB_2_LO,
+                           fputil::multiply_add(kd, ExpBase::M_LOGB_2_HI, xd));
   double dx2 = dx * dx;
 
-  // Polynomials generated by Sollya with:
-  // Q = fpminimax(expm1(x)/x, 5, [|1, D...|], [-1/32*log(2), 1/32*log(2)]);
-  // Then:
-  //   e^lo ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[4] * dx^6.
-  constexpr double COEFFS[5] = {0x1.fffffffffffep-2, 0x1.55555554ad3f3p-3,
-                                0x1.55555557179cap-5, 0x1.111228f3478c9p-7,
-                                0x1.6c161beccc69dp-10};
   // c0 = 1 + COEFFS[0] * lo^2
-  double c0 = fputil::multiply_add(dx2, COEFFS[0], 1.0);
-  // c1 = 1 + COEFFS[0] * lo^2
-  double c1 = fputil::multiply_add(dx2, COEFFS[1], 1.0);
-  // c2 = COEFFS[2] + COEFFS[4] * lo^2
-  double c2 = fputil::multiply_add(dx2, COEFFS[4], COEFFS[2]);
-  double dx4 = dx2 * dx2;
-  // P_even = c0 + c2 * lo^4
-  //        = (1 + COEFFS[0] * lo^2) + lo^4 * (COEFFS[2] + COEFFS[4] * lo^2)
-  //        = 1 + COEFFS[0] * lo^2 + COEFFS[2] * lo^4 + COEFFS[4] * lo^6
-  double p_even = fputil::multiply_add(dx4, c2, c0);
-  // P_odd = c1 + COEFFS[3] * lo^4
-  //       = 1 + COEFFS[1] * lo^2 + COEFFS[3] * lo^4
-  double p_odd = fputil::multiply_add(dx4, COEFFS[3], c1);
+  // P_even = 1 + COEFFS[0] * lo^2 + COEFFS[2] * lo^4
+  double p_even =
+      fputil::polyeval(dx2, 1.0, ExpBase::COEFFS[0], ExpBase::COEFFS[2]);
+  // P_odd = 1 + COEFFS[1] * lo^2 + COEFFS[3] * lo^4
+  double p_odd =
+      fputil::polyeval(dx2, 1.0, ExpBase::COEFFS[1], ExpBase::COEFFS[3]);
 
   double r;
   if constexpr (is_sinh)

diff  --git a/libc/src/math/generic/tanhf.cpp b/libc/src/math/generic/tanhf.cpp
index e1f753c12161a..22b95870f4c69 100644
--- a/libc/src/math/generic/tanhf.cpp
+++ b/libc/src/math/generic/tanhf.cpp
@@ -53,10 +53,17 @@ LLVM_LIBC_FUNCTION(float, tanhf, (float x)) {
       return FPBits(0x3f7f'6ad9U).get_val();
   }
 
-  auto ep = exp_eval(2.0f * (sign ? x : -x)); // exp(-2 * x)
-  double result = fputil::multiply_add(ep.mult_exp, ep.r, ep.mult_exp - 1.0) /
-                  (fputil::multiply_add(ep.mult_exp, ep.r, ep.mult_exp + 1.0));
-  return sign ? result : -result;
+  // Range reduction: e^(2x) = 2^(mid + hi) * e^lo
+  auto ep = exp_b_range_reduc<ExpBase>(2.0f * x); // exp(2 * x)
+  double r = ExpBase::powb_lo(ep.lo);
+  // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+#if defined(LIBC_TARGET_HAS_FMA)
+  return fputil::multiply_add(ep.mh, r, -1.0) /
+         fputil::multiply_add(ep.mh, r, 1.0);
+#else
+  double exp_x = ep.mh * r;
+  return (exp_x - 1.0) / (exp_x + 1.0);
+#endif // LIBC_TARGET_HAS_FMA
 }
 
 } // namespace __llvm_libc

diff  --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp
index 6aa9cb3ad0b87..a9d7cc32b6a92 100644
--- a/libc/test/src/math/explogxf_test.cpp
+++ b/libc/test/src/math/explogxf_test.cpp
@@ -27,9 +27,9 @@ auto f_normal = [](float x) -> bool {
 
 TEST(LlvmLibcExpxfTest, InFloatRange) {
   auto fx = [](float x) -> float {
-    auto result = __llvm_libc::exp_eval<-1>(x);
-    return static_cast<float>(2 * result.mult_exp * result.r +
-                              2 * result.mult_exp);
+    auto result = __llvm_libc::exp_b_range_reduc<__llvm_libc::ExpBase>(x);
+    double r = __llvm_libc::ExpBase::powb_lo(result.lo);
+    return static_cast<float>(result.mh * r);
   };
   auto f_check = [](float x) -> bool {
     return !(


        


More information about the libc-commits mailing list