[libc-commits] [libc] c78144e - [libc][math] Improved performance of exp2f function.
Kirill Okhotnikov via libc-commits
libc-commits at lists.llvm.org
Thu Jul 28 01:58:11 PDT 2022
Author: Kirill Okhotnikov
Date: 2022-07-28T10:57:16+02:00
New Revision: c78144e1c74b689504b240c0a511b8561212523f
URL: https://github.com/llvm/llvm-project/commit/c78144e1c74b689504b240c0a511b8561212523f
DIFF: https://github.com/llvm/llvm-project/commit/c78144e1c74b689504b240c0a511b8561212523f.diff
LOG: [libc][math] Improved performance of exp2f function.
New exp2 function algorithm:
1) Improved performance: 8.176 vs 15.270 by core-math perf tool.
2) Improved accuracy. Only two special values left.
3) Lookup table size reduced twice.
Differential Revision: https://reviews.llvm.org/D129005
Added:
Modified:
libc/docs/math.rst
libc/src/math/generic/CMakeLists.txt
libc/src/math/generic/common_constants.cpp
libc/src/math/generic/common_constants.h
libc/src/math/generic/exp2f.cpp
Removed:
################################################################################
diff --git a/libc/docs/math.rst b/libc/docs/math.rst
index 681bc8ca3ea33..dbc95ce9dbea3 100644
--- a/libc/docs/math.rst
+++ b/libc/docs/math.rst
@@ -201,7 +201,7 @@ Performance
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
| expf | 9 | 7 | 44 | 38 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
-| exp2f | 25 | 8 | 81 | 37 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
+| exp2f | 8 | 6 | 35 | 23 | :math:`[-10, 10]` | i5-1135G7 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
| expm1f | 9 | 44 | 42 | 121 | :math:`[-10, 10]` | Ryzen 1700 | Ubuntu 20.04 LTS x86_64 | Clang 12.0.0 | FMA |
+--------------+-----------+-------------------+-----------+-------------------+-------------------------------------+------------+-------------------------+--------------+---------------+
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b00bc25bcebbe..f9b10e7f97f02 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -499,7 +499,10 @@ add_entrypoint_object(
HDRS
../exp2f.h
DEPENDS
+ .common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.nearest_integer
libc.src.__support.FPUtil.polyeval
libc.include.math
COMPILE_OPTIONS
diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp
index c2fcdf708de65..1bbca8ff57fc8 100644
--- a/libc/src/math/generic/common_constants.cpp
+++ b/libc/src/math/generic/common_constants.cpp
@@ -225,6 +225,20 @@ const double EXP_M2[128] = {
0x1.4e9c56c731f5dp1, 0x1.513c2e6c731d7p1, 0x1.53e14b042f9cap1,
0x1.568bb722dd593p1, 0x1.593b7d72305bbp1,
};
+// Wolfram alpha: N[Table[2^x-1,{x,-16/32,15/32,1/32}],27]
+// printf("%.13a,\n", d[i]);
+const double EXP_2_POW[EXP_num_p] = {
+ -0x1.2bec333018867p-2, -0x1.1c1142e274118p-2, -0x1.0bdd71829fcf2p-2,
+ -0x1.f69d99accc7b6p-3, -0x1.d4c6af7557c93p-3, -0x1.b23213cc8e86cp-3,
+ -0x1.8edb9f5703dc0p-3, -0x1.6abf137076a8ep-3, -0x1.45d819a94b14bp-3,
+ -0x1.20224341286e4p-3, -0x1.f332113d56b1fp-4, -0x1.a46f918837cb7p-4,
+ -0x1.53f391822dbc7p-4, -0x1.01b466423250ap-4, -0x1.5b505d5b6f268p-5,
+ -0x1.5f134923757f3p-6, 0x0.0000000000000p+0, 0x1.66c34c5615d0fp-6,
+ 0x1.6ab0d9f3121ecp-5, 0x1.1301d0125b50ap-4, 0x1.72b83c7d517aep-4,
+ 0x1.d4873168b9aa8p-4, 0x1.1c3d373ab11c3p-3, 0x1.4f4efa8fef709p-3,
+ 0x1.837f0518db8a9p-3, 0x1.b8d39b9d54e55p-3, 0x1.ef5326091a112p-3,
+ 0x1.13821818624b4p-2, 0x1.2ff6b54d8a89cp-2, 0x1.4d0ad5a753e07p-2,
+ 0x1.6ac1f752150a5p-2, 0x1.891fac0e95613p-2};
// Lookup table for sin(k * pi / 16) with k = 0, ..., 31.
// Table is generated with Sollya as follow:
diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h
index fd3f8d4932bab..1c0d685abc215 100644
--- a/libc/src/math/generic/common_constants.h
+++ b/libc/src/math/generic/common_constants.h
@@ -37,6 +37,12 @@ extern const double EXP_M2[128];
// > for k from 0 to 31 do { D(sin(k * pi/16)); };
extern const double SIN_K_PI_OVER_16[32];
+static constexpr int EXP_bits_p = 5;
+static constexpr int EXP_num_p = 1 << EXP_bits_p;
+
+// Wolfram alpha: N[Table[2^x-1,{x,-16/32,15/32,1/32}],27]
+// printf("%.13a,\n", d[i]);
+extern const double EXP_2_POW[EXP_num_p];
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_MATH_GENERIC_COMMON_CONSTANTS_H
diff --git a/libc/src/math/generic/exp2f.cpp b/libc/src/math/generic/exp2f.cpp
index 0f56959059e33..9756b1ac51901 100644
--- a/libc/src/math/generic/exp2f.cpp
+++ b/libc/src/math/generic/exp2f.cpp
@@ -7,45 +7,24 @@
//===----------------------------------------------------------------------===//
#include "src/math/exp2f.h"
-#include "src/__support/FPUtil/BasicOperations.h"
+#include "common_constants.h"
#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FMA.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
#include "src/__support/common.h"
#include <errno.h>
namespace __llvm_libc {
-// Lookup table for 2^(m * 2^(-6)) with m = 0, ..., 63.
-// Table is generated with Sollya as follow:
-// > display = hexadecimal;
-// > for i from 0 to 63 do { D(2^(i / 64)); };
-static constexpr double EXP_M[64] = {
- 0x1.0000000000000p0, 0x1.02c9a3e778061p0, 0x1.059b0d3158574p0,
- 0x1.0874518759bc8p0, 0x1.0b5586cf9890fp0, 0x1.0e3ec32d3d1a2p0,
- 0x1.11301d0125b51p0, 0x1.1429aaea92de0p0, 0x1.172b83c7d517bp0,
- 0x1.1a35beb6fcb75p0, 0x1.1d4873168b9aap0, 0x1.2063b88628cd6p0,
- 0x1.2387a6e756238p0, 0x1.26b4565e27cddp0, 0x1.29e9df51fdee1p0,
- 0x1.2d285a6e4030bp0, 0x1.306fe0a31b715p0, 0x1.33c08b26416ffp0,
- 0x1.371a7373aa9cbp0, 0x1.3a7db34e59ff7p0, 0x1.3dea64c123422p0,
- 0x1.4160a21f72e2ap0, 0x1.44e086061892dp0, 0x1.486a2b5c13cd0p0,
- 0x1.4bfdad5362a27p0, 0x1.4f9b2769d2ca7p0, 0x1.5342b569d4f82p0,
- 0x1.56f4736b527dap0, 0x1.5ab07dd485429p0, 0x1.5e76f15ad2148p0,
- 0x1.6247eb03a5585p0, 0x1.6623882552225p0, 0x1.6a09e667f3bcdp0,
- 0x1.6dfb23c651a2fp0, 0x1.71f75e8ec5f74p0, 0x1.75feb564267c9p0,
- 0x1.7a11473eb0187p0, 0x1.7e2f336cf4e62p0, 0x1.82589994cce13p0,
- 0x1.868d99b4492edp0, 0x1.8ace5422aa0dbp0, 0x1.8f1ae99157736p0,
- 0x1.93737b0cdc5e5p0, 0x1.97d829fde4e50p0, 0x1.9c49182a3f090p0,
- 0x1.a0c667b5de565p0, 0x1.a5503b23e255dp0, 0x1.a9e6b5579fdbfp0,
- 0x1.ae89f995ad3adp0, 0x1.b33a2b84f15fbp0, 0x1.b7f76f2fb5e47p0,
- 0x1.bcc1e904bc1d2p0, 0x1.c199bdd85529cp0, 0x1.c67f12e57d14bp0,
- 0x1.cb720dcef9069p0, 0x1.d072d4a07897cp0, 0x1.d5818dcfba487p0,
- 0x1.da9e603db3285p0, 0x1.dfc97337b9b5fp0, 0x1.e502ee78b3ff6p0,
- 0x1.ea4afa2a490dap0, 0x1.efa1bee615a27p0, 0x1.f50765b6e4540p0,
- 0x1.fa7c1819e90d8p0,
-};
+constexpr float mlp = EXP_num_p;
+constexpr float mmld = -1.0 / mlp;
+
+constexpr uint32_t exval1 = 0x3b42'9d37U;
+constexpr uint32_t exval2 = 0xbcf3'a937U;
+constexpr uint32_t exval_mask = exval1 & exval2;
LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
using FPBits = typename fputil::FPBits<float>;
@@ -54,36 +33,6 @@ LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
uint32_t x_u = xbits.uintval();
uint32_t x_abs = x_u & 0x7fff'ffffU;
- // Exceptional values.
- switch (x_u) {
- case 0x3b42'9d37U: // x = 0x1.853a6ep-9f
- if (fputil::get_round() == FE_TONEAREST)
- return 0x1.00870ap+0f;
- break;
- case 0x3c02'a9adU: // x = 0x1.05535ap-7f
- if (fputil::get_round() == FE_TONEAREST)
- return 0x1.016b46p+0f;
- break;
- case 0x3ca6'6e26U: { // x = 0x1.4cdc4cp-6f
- int round_mode = fputil::get_round();
- if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD)
- return 0x1.03a16ap+0f;
- return 0x1.03a168p+0f;
- }
- case 0x3d92'a282U: // x = 0x1.254504p-4f
- if (fputil::get_round() == FE_UPWARD)
- return 0x1.0d0688p+0f;
- return 0x1.0d0686p+0f;
- case 0xbcf3'a937U: // x = -0x1.e7526ep-6f
- if (fputil::get_round() == FE_TONEAREST)
- return 0x1.f58d62p-1f;
- break;
- case 0xb8d3'd026U: // x = -0x1.a7a04cp-14f
- if (fputil::get_round() == FE_TONEAREST)
- return 0x1.fff6d2p-1f;
- break;
- }
-
// // When |x| >= 128, |x| < 2^-25, or x is nan
if (unlikely(x_abs >= 0x4300'0000U || x_abs <= 0x3280'0000U)) {
// |x| < 2^-25
@@ -101,7 +50,7 @@ LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
errno = ERANGE;
}
// x is +inf or nan
- return x + static_cast<float>(FPBits::inf());
+ return x + FPBits::inf().get_val();
}
// x < -150
if (x_u >= 0xc316'0000U) {
@@ -112,54 +61,41 @@ LLVM_LIBC_FUNCTION(float, exp2f, (float x)) {
if (xbits.is_nan())
return x;
if (fputil::get_round() == FE_UPWARD)
- return static_cast<float>(FPBits(FPBits::MIN_SUBNORMAL));
+ return FPBits(FPBits::MIN_SUBNORMAL).get_val();
if (x != 0.0f)
errno = ERANGE;
return 0.0f;
}
}
- // For -150 <= x < 128, to compute 2^x, we perform the following range
- // reduction: find hi, mid, lo such that:
- // x = hi + mid + lo, in which
- // hi is an integer,
- // mid * 2^6 is an integer
- // -2^(-7) <= lo < 2^-7.
- // In particular,
- // hi + mid = round(x * 2^6) * 2^(-6).
- // Then,
- // 2^(x) = 2^(hi + mid + lo) = 2^hi * 2^mid * 2^lo.
- // Multiply by 2^hi is simply adding hi to the exponent field. We store
- // exp(mid) in the lookup tables EXP_M. exp(lo) is computed using a degree-4
- // minimax polynomial generated by Sollya.
- // x_hi = round(hi + mid).
- // The default rounding mode for float-to-int conversion in C++ is
- // round-toward-zero. To make it round-to-nearest, we add (-1)^sign(x) * 0.5
- // before conversion.
- int x_hi =
- static_cast<int>(x * 0x1.0p+6f + (xbits.get_sign() ? -0.5f : 0.5f));
- // For 2-complement integers, arithmetic right shift is the same as dividing
- // by a power of 2 and then round down (toward negative infinity).
- int e_hi = (x_hi >> 6) +
- static_cast<int>(fputil::FloatProperties<double>::EXPONENT_BIAS);
- fputil::FPBits<double> exp_hi(
- static_cast<uint64_t>(e_hi)
- << fputil::FloatProperties<double>::MANTISSA_WIDTH);
- // mid = x_hi & 0x0000'003fU;
- double exp_hi_mid = static_cast<double>(exp_hi) * EXP_M[x_hi & 0x3f];
- // Subtract (hi + mid) from x to get lo.
- x -= static_cast<float>(x_hi) * 0x1.0p-6f;
- double xd = static_cast<double>(x);
- // Degree-4 minimax polynomial generated by Sollya with the following
- // commands:
- // > display = hexadecimal;
- // > Q = fpminimax((2^x - 1)/x, 3, [|D...|], [-2^-7, 2^-7]);
- // > Q;
- double exp_lo =
- fputil::polyeval(xd, 0x1p0, 0x1.62e42fefa2417p-1, 0x1.ebfbdff82f809p-3,
- 0x1.c6b0b92131c47p-5, 0x1.3b2ab6fb568a3p-7);
- double result = exp_hi_mid * exp_lo;
- return static_cast<float>(result);
+ if (unlikely(x_u & exval_mask) == exval_mask) {
+ if (unlikely(x_u == exval1)) { // x = 0x1.853a6ep-9f
+ if (fputil::get_round() == FE_TONEAREST)
+ return 0x1.00870ap+0f;
+ } else if (unlikely(x_u == exval2)) { // x = -0x1.e7526ep-6f
+ if (fputil::get_round() == FE_TONEAREST)
+ return 0x1.f58d62p-1f;
+ }
+ }
+
+ float kf = fputil::nearest_integer(x * mlp);
+ double dx = fputil::multiply_add(mmld, kf, x);
+ double mult_f, ml;
+ {
+ uint32_t ps = static_cast<int>(kf) + (1 << (EXP_bits_p - 1)) +
+ (fputil::FPBits<double>::EXPONENT_BIAS << EXP_bits_p);
+ fputil::FPBits<double> bs;
+ bs.set_unbiased_exponent(ps >> EXP_bits_p);
+ ml = 1.0 + EXP_2_POW[ps & (EXP_num_p - 1)];
+ mult_f = bs.get_val();
+ }
+
+ // N[Table[Ln[2]^n/n!,{n,1,6}],30]
+ double pe = fputil::polyeval(
+ dx, 1.0, 0x1.62e42fefa39efp-1, 0x1.ebfbdff82c58fp-3, 0x1.c6b08d704a0c0p-5,
+ 0x1.3b2ab6fba4e77p-7, 0x1.5d87fe78a6731p-10, 0x1.430912f86c787p-13);
+
+ return mult_f * ml * pe;
}
} // namespace __llvm_libc
More information about the libc-commits
mailing list