[libc-commits] [libc] bc8e87e - [libc][math] Update range reduction step for logf and reduce its latency.
Tue Ly via libc-commits
libc-commits at lists.llvm.org
Mon Apr 10 10:00:44 PDT 2023
Author: Tue Ly
Date: 2023-04-10T13:00:37-04:00
New Revision: bc8e87ef4ae94323fdb17e9c916d3e491d82b215
URL: https://github.com/llvm/llvm-project/commit/bc8e87ef4ae94323fdb17e9c916d3e491d82b215
DIFF: https://github.com/llvm/llvm-project/commit/bc8e87ef4ae94323fdb17e9c916d3e491d82b215.diff
LOG: [libc][math] Update range reduction step for logf and reduce its latency.
Simplify the range reduction steps by choosing the reduction constants
carefully so that the reduced arguments v = r*m_x - 1 and v^2 are exact in double
precision, even without FMA instructions, and -2^-8 <= v < 2^-7. This allows the
polynomial evaluations to be parallelized more efficiently.
Reviewed By: santoshn, zimmermann6
Differential Revision: https://reviews.llvm.org/D147755
Added:
Modified:
libc/src/math/generic/common_constants.cpp
libc/src/math/generic/common_constants.h
libc/src/math/generic/logf.cpp
libc/test/src/math/logf_test.cpp
Removed:
################################################################################
diff --git a/libc/src/math/generic/common_constants.cpp b/libc/src/math/generic/common_constants.cpp
index a694888bdb7a7..b1d0aeb2a65b4 100644
--- a/libc/src/math/generic/common_constants.cpp
+++ b/libc/src/math/generic/common_constants.cpp
@@ -109,7 +109,7 @@ const double LOG_F[128] = {
// precision, and -2^-8 <= v < 2^-7.
// TODO(lntue): Add reference to how the constants are derived after the
// resulting paper is ready.
-const float R[128] = {
+alignas(32) const float R[128] = {
0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1,
0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1,
0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1,
@@ -130,6 +130,72 @@ const float R[128] = {
0x1.0ap-1, 0x1.08p-1, 0x1.08p-1, 0x1.06p-1, 0x1.06p-1, 0x1.04p-1, 0x1.04p-1,
0x1.02p-1, 0x1.0p-1};
+alignas(64) const double RD[128] = {
+ 0x1p0, 0x1.fcp-1, 0x1.f8p-1, 0x1.f4p-1, 0x1.fp-1, 0x1.ecp-1, 0x1.e8p-1,
+ 0x1.e4p-1, 0x1.ep-1, 0x1.dep-1, 0x1.dap-1, 0x1.d6p-1, 0x1.d4p-1, 0x1.dp-1,
+ 0x1.ccp-1, 0x1.cap-1, 0x1.c6p-1, 0x1.c4p-1, 0x1.cp-1, 0x1.bep-1, 0x1.bap-1,
+ 0x1.b8p-1, 0x1.b4p-1, 0x1.b2p-1, 0x1.aep-1, 0x1.acp-1, 0x1.a8p-1, 0x1.a6p-1,
+ 0x1.a4p-1, 0x1.ap-1, 0x1.9ep-1, 0x1.9cp-1, 0x1.98p-1, 0x1.96p-1, 0x1.94p-1,
+ 0x1.92p-1, 0x1.9p-1, 0x1.8cp-1, 0x1.8ap-1, 0x1.88p-1, 0x1.86p-1, 0x1.84p-1,
+ 0x1.8p-1, 0x1.7ep-1, 0x1.7cp-1, 0x1.7ap-1, 0x1.78p-1, 0x1.76p-1, 0x1.74p-1,
+ 0x1.72p-1, 0x1.7p-1, 0x1.6ep-1, 0x1.6cp-1, 0x1.6ap-1, 0x1.68p-1, 0x1.66p-1,
+ 0x1.64p-1, 0x1.62p-1, 0x1.6p-1, 0x1.5ep-1, 0x1.5cp-1, 0x1.5ap-1, 0x1.58p-1,
+ 0x1.56p-1, 0x1.54p-1, 0x1.54p-1, 0x1.52p-1, 0x1.5p-1, 0x1.4ep-1, 0x1.4cp-1,
+ 0x1.4ap-1, 0x1.4ap-1, 0x1.48p-1, 0x1.46p-1, 0x1.44p-1, 0x1.42p-1, 0x1.4p-1,
+ 0x1.4p-1, 0x1.3ep-1, 0x1.3cp-1, 0x1.3ap-1, 0x1.3ap-1, 0x1.38p-1, 0x1.36p-1,
+ 0x1.34p-1, 0x1.34p-1, 0x1.32p-1, 0x1.3p-1, 0x1.3p-1, 0x1.2ep-1, 0x1.2cp-1,
+ 0x1.2cp-1, 0x1.2ap-1, 0x1.28p-1, 0x1.28p-1, 0x1.26p-1, 0x1.24p-1, 0x1.24p-1,
+ 0x1.22p-1, 0x1.2p-1, 0x1.2p-1, 0x1.1ep-1, 0x1.1cp-1, 0x1.1cp-1, 0x1.1ap-1,
+ 0x1.1ap-1, 0x1.18p-1, 0x1.16p-1, 0x1.16p-1, 0x1.14p-1, 0x1.14p-1, 0x1.12p-1,
+ 0x1.1p-1, 0x1.1p-1, 0x1.0ep-1, 0x1.0ep-1, 0x1.0cp-1, 0x1.0cp-1, 0x1.0ap-1,
+ 0x1.0ap-1, 0x1.08p-1, 0x1.08p-1, 0x1.06p-1, 0x1.06p-1, 0x1.04p-1, 0x1.04p-1,
+ 0x1.02p-1, 0x1.0p-1};
+
+alignas(64) const double LOG_R[128] = {
+ 0x0.0000000000000p0, 0x1.010157588de71p-7, 0x1.0205658935847p-6,
+ 0x1.8492528c8cabfp-6, 0x1.0415d89e74444p-5, 0x1.466aed42de3eap-5,
+ 0x1.894aa149fb343p-5, 0x1.ccb73cdddb2ccp-5, 0x1.08598b59e3a07p-4,
+ 0x1.1973bd1465567p-4, 0x1.3bdf5a7d1ee64p-4, 0x1.5e95a4d9791cbp-4,
+ 0x1.700d30aeac0e1p-4, 0x1.9335e5d594989p-4, 0x1.b6ac88dad5b1cp-4,
+ 0x1.c885801bc4b23p-4, 0x1.ec739830a112p-4, 0x1.fe89139dbd566p-4,
+ 0x1.1178e8227e47cp-3, 0x1.1aa2b7e23f72ap-3, 0x1.2d1610c86813ap-3,
+ 0x1.365fcb0159016p-3, 0x1.4913d8333b561p-3, 0x1.527e5e4a1b58dp-3,
+ 0x1.6574ebe8c133ap-3, 0x1.6f0128b756abcp-3, 0x1.823c16551a3c2p-3,
+ 0x1.8beafeb38fe8cp-3, 0x1.95a5adcf7017fp-3, 0x1.a93ed3c8ad9e3p-3,
+ 0x1.b31d8575bce3dp-3, 0x1.bd087383bd8adp-3, 0x1.d1037f2655e7bp-3,
+ 0x1.db13db0d4894p-3, 0x1.e530effe71012p-3, 0x1.ef5ade4dcffe6p-3,
+ 0x1.f991c6cb3b379p-3, 0x1.07138604d5862p-2, 0x1.0c42d676162e3p-2,
+ 0x1.1178e8227e47cp-2, 0x1.16b5ccbacfb73p-2, 0x1.1bf99635a6b95p-2,
+ 0x1.269621134db92p-2, 0x1.2bef07cdc9354p-2, 0x1.314f1e1d35ce4p-2,
+ 0x1.36b6776be1117p-2, 0x1.3c25277333184p-2, 0x1.419b423d5e8c7p-2,
+ 0x1.4718dc271c41bp-2, 0x1.4c9e09e172c3cp-2, 0x1.522ae0738a3d8p-2,
+ 0x1.57bf753c8d1fbp-2, 0x1.5d5bddf595f3p-2, 0x1.630030b3aac49p-2,
+ 0x1.68ac83e9c6a14p-2, 0x1.6e60ee6af1972p-2, 0x1.741d876c67bb1p-2,
+ 0x1.79e26687cfb3ep-2, 0x1.7fafa3bd8151cp-2, 0x1.85855776dcbfbp-2,
+ 0x1.8b639a88b2df5p-2, 0x1.914a8635bf68ap-2, 0x1.973a3431356aep-2,
+ 0x1.9d32bea15ed3bp-2, 0x1.a33440224fa79p-2, 0x1.a33440224fa79p-2,
+ 0x1.a93ed3c8ad9e3p-2, 0x1.af5295248cddp-2, 0x1.b56fa04462909p-2,
+ 0x1.bb9611b80e2fbp-2, 0x1.c1c60693fa39ep-2, 0x1.c1c60693fa39ep-2,
+ 0x1.c7ff9c74554c9p-2, 0x1.ce42f18064743p-2, 0x1.d490246defa6bp-2,
+ 0x1.dae75484c9616p-2, 0x1.e148a1a2726cep-2, 0x1.e148a1a2726cep-2,
+ 0x1.e7b42c3ddad73p-2, 0x1.ee2a156b413e5p-2, 0x1.f4aa7ee03192dp-2,
+ 0x1.f4aa7ee03192dp-2, 0x1.fb358af7a4884p-2, 0x1.00e5ae5b207abp-1,
+ 0x1.04360be7603adp-1, 0x1.04360be7603adp-1, 0x1.078bf0533c568p-1,
+ 0x1.0ae76e2d054fap-1, 0x1.0ae76e2d054fap-1, 0x1.0e4898611cce1p-1,
+ 0x1.11af823c75aa8p-1, 0x1.11af823c75aa8p-1, 0x1.151c3f6f29612p-1,
+ 0x1.188ee40f23ca6p-1, 0x1.188ee40f23ca6p-1, 0x1.1c07849ae6007p-1,
+ 0x1.1f8635fc61659p-1, 0x1.1f8635fc61659p-1, 0x1.230b0d8bebc98p-1,
+ 0x1.269621134db92p-1, 0x1.269621134db92p-1, 0x1.2a2786d0ec107p-1,
+ 0x1.2dbf557b0df43p-1, 0x1.2dbf557b0df43p-1, 0x1.315da4434068bp-1,
+ 0x1.315da4434068bp-1, 0x1.35028ad9d8c86p-1, 0x1.38ae2171976e7p-1,
+ 0x1.38ae2171976e7p-1, 0x1.3c6080c36bfb5p-1, 0x1.3c6080c36bfb5p-1,
+ 0x1.4019c2125ca93p-1, 0x1.43d9ff2f923c5p-1, 0x1.43d9ff2f923c5p-1,
+ 0x1.47a1527e8a2d3p-1, 0x1.47a1527e8a2d3p-1, 0x1.4b6fd6f970c1fp-1,
+ 0x1.4b6fd6f970c1fp-1, 0x1.4f45a835a4e19p-1, 0x1.4f45a835a4e19p-1,
+ 0x1.5322e26867857p-1, 0x1.5322e26867857p-1, 0x1.5707a26bb8c66p-1,
+ 0x1.5707a26bb8c66p-1, 0x1.5af405c3649ep-1, 0x1.5af405c3649ep-1,
+ 0x1.5ee82aa24192p-1, 0x0.000000000000p0};
+
// Lookup table for exp(m) with m = -104, ..., 89.
// -104 = floor(log(single precision's min denormal))
// 89 = ceil(log(single precision's max normal))
diff --git a/libc/src/math/generic/common_constants.h b/libc/src/math/generic/common_constants.h
index c20eb8e61b532..b0567f3ed083a 100644
--- a/libc/src/math/generic/common_constants.h
+++ b/libc/src/math/generic/common_constants.h
@@ -20,6 +20,12 @@ extern const double LOG_F[128];
// Lookup table for range reduction constants r for logarithms.
extern const float R[128];
+// Lookup table for range reduction constants r for logarithms.
+extern const double RD[128];
+
+// Lookup table for -log(r)
+extern const double LOG_R[128];
+
// Lookup table for exp(m) with m = -104, ..., 89.
// -104 = floor(log(single precision's min denormal))
// 89 = ceil(log(single precision's max normal))
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index f2c1fa33309bf..73634e009c74e 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -57,81 +57,115 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
FPBits xbits(x);
uint32_t x_u = xbits.uintval();
+ int m = -FPBits::EXPONENT_BIAS;
+
using fputil::round_result_slightly_down;
using fputil::round_result_slightly_up;
- switch (x_u) {
- case 0x3f800001U: // x = 0x1.000002p+0f
- return round_result_slightly_up(0x1.fffffep-24f);
- case 0x41178febU: // x = 0x1.2f1fd6p+3f
- return round_result_slightly_up(0x1.1fcbcep+1f);
- case 0x4c5d65a5U: // x = 0x1.bacb4ap+25f
- return round_result_slightly_down(0x1.1e0696p+4f);
- case 0x500ffb03U: // x = 0x1.1ff606p+33f
- return round_result_slightly_up(0x1.6fdd34p+4f);
- case 0x5cd69e88U: // x = 0x1.ad3d1p+58f
- return round_result_slightly_up(0x1.45c146p+5f);
- case 0x65d890d3U: // x = 0x1.b121a6p+76f
- return round_result_slightly_down(0x1.a9a3f2p+5f);
- case 0x6f31a8ecU: // x = 0x1.6351d8p+95f
- return round_result_slightly_down(0x1.08b512p+6f);
- case 0x7a17f30aU: // x = 0x1.2fe614p+117f
- return round_result_slightly_up(0x1.451436p+6f);
-#ifndef LIBC_TARGET_CPU_HAS_FMA
- case 0x1b7679ffU: // x = 0x1.ecf3fep-73f
- return round_result_slightly_up(-0x1.8f8e5ap+5f);
- case 0x1e88452dU: // x = 0x1.108a5ap-66f
- return round_result_slightly_up(-0x1.6d7b18p+5f);
- case 0x5ee8984eU: // x = 0x1.d1309cp+62f;
- return round_result_slightly_up(0x1.5c9442p+5f);
- case 0x665e7ca6U: // x = 0x1.bcf94cp+77f
- return round_result_slightly_up(0x1.af66cp+5f);
- case 0x79e7ec37U: // x = 0x1.cfd86ep+116f
- return round_result_slightly_up(0x1.43ff6ep+6f);
+ // Small inputs
+ if (x_u < 0x4c5d65a5U) {
+ // Hard-to-round cases.
+ switch (x_u) {
+ case 0x3f7f4d6fU: // x = 0x1.fe9adep-1f
+ return round_result_slightly_up(-0x1.659ec8p-9f);
+ case 0x41178febU: // x = 0x1.2f1fd6p+3f
+ return round_result_slightly_up(0x1.1fcbcep+1f);
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+ case 0x3f800000U: // x = 1.0f
+ return 0.0f;
+#else
+ case 0x1e88452dU: // x = 0x1.108a5ap-66f
+ return round_result_slightly_up(-0x1.6d7b18p+5f);
#endif // LIBC_TARGET_CPU_HAS_FMA
- }
-
- int m = -FPBits::EXPONENT_BIAS;
-
- if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) {
- if (xbits.is_zero()) {
- // Return -inf and raise FE_DIVBYZERO
- fputil::set_errno_if_required(ERANGE);
- fputil::raise_except_if_required(FE_DIVBYZERO);
- return static_cast<float>(FPBits::neg_inf());
}
- if (xbits.get_sign() && !xbits.is_nan()) {
- // Return NaN and raise FE_INVALID
- fputil::set_errno_if_required(EDOM);
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::build_quiet_nan(0);
+ // Subnormal inputs.
+ if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL)) {
+ if (x_u == 0) {
+ // Return -inf and raise FE_DIVBYZERO
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+ return static_cast<float>(FPBits::neg_inf());
+ }
+ // Normalize denormal inputs.
+ xbits.set_val(xbits.get_val() * 0x1.0p23f);
+ m -= 23;
+ x_u = xbits.uintval();
+ }
+ } else {
+ // Hard-to-round cases.
+ switch (x_u) {
+ case 0x4c5d65a5U: // x = 0x1.bacb4ap+25f
+ return round_result_slightly_down(0x1.1e0696p+4f);
+ case 0x65d890d3U: // x = 0x1.b121a6p+76f
+ return round_result_slightly_down(0x1.a9a3f2p+5f);
+ case 0x6f31a8ecU: // x = 0x1.6351d8p+95f
+ return round_result_slightly_down(0x1.08b512p+6f);
+ case 0x7a17f30aU: // x = 0x1.2fe614p+117f
+ return round_result_slightly_up(0x1.451436p+6f);
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+ case 0x500ffb03U: // x = 0x1.1ff606p+33f
+ return round_result_slightly_up(0x1.6fdd34p+4f);
+ case 0x5cd69e88U: // x = 0x1.ad3d1p+58f
+ return round_result_slightly_up(0x1.45c146p+5f);
+ case 0x5ee8984eU: // x = 0x1.d1309cp+62f;
+ return round_result_slightly_up(0x1.5c9442p+5f);
+#endif // LIBC_TARGET_CPU_HAS_FMA
}
- if (xbits.is_inf_or_nan()) {
+ // Exceptional inputs.
+ if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) {
+ if (x_u == 0x8000'0000U) {
+ // Return -inf and raise FE_DIVBYZERO
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+ return static_cast<float>(FPBits::neg_inf());
+ }
+ if (xbits.get_sign() && !xbits.is_nan()) {
+ // Return NaN and raise FE_INVALID
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::build_quiet_nan(0);
+ }
+ // x is +inf or nan
return x;
}
- // Normalize denormal inputs.
- xbits.set_val(xbits.get_val() * 0x1.0p23f);
- m -= 23;
}
- m += xbits.get_unbiased_exponent();
- int f_index = xbits.get_mantissa() >> 16;
- // Set bits to 1.m
- xbits.set_unbiased_exponent(0x7F);
-
- FPBits f = xbits;
- f.bits &= ~0x0000'FFFF;
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+ // Returning the correct +0 when x = 1.0 for non-FMA targets with FE_DOWNWARD
+ // rounding mode.
+ if (LIBC_UNLIKELY((x_u & 0x007f'ffffU) == 0))
+ return static_cast<float>(
+ static_cast<double>(m + xbits.get_unbiased_exponent()) * LOG_2);
+#endif // LIBC_TARGET_CPU_HAS_FMA
- double d = static_cast<float>(xbits) - static_cast<float>(f);
- d *= ONE_OVER_F[f_index];
+ uint32_t mant = xbits.get_mantissa();
+ // Extract 7 leading fractional bits of the mantissa
+ int index = mant >> 16;
+ // Add unbiased exponent. Add an extra 1 if the 7 leading fractional bits are
+ // all 1's.
+ m += static_cast<int>((x_u + (1 << 16)) >> 23);
- double extra_factor =
- fputil::multiply_add(static_cast<double>(m), LOG_2, LOG_F[f_index]);
+ // Set bits to 1.m
+ xbits.set_unbiased_exponent(0x7F);
- double r = __llvm_libc::fputil::polyeval(
- d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2,
- 0x1.5555513bc679ap-2, -0x1.fff4805ea441p-3, 0x1.930180dbde91ap-3);
+ float u = static_cast<float>(xbits);
+ double v;
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+ v = static_cast<double>(fputil::multiply_add(u, R[index], -1.0f)); // Exact.
+#else
+ v = fputil::multiply_add(static_cast<double>(u), RD[index], -1.0); // Exact
+#endif // LIBC_TARGET_CPU_HAS_FMA
+ // Degree-5 polynomial approximation of log generated by Sollya with:
+ // > P = fpminimax(log(1 + x)/x, 4, [|1, D...|], [-2^-8, 2^-7]);
+ constexpr double COEFFS[4] = {-0x1.000000000fe63p-1, 0x1.555556e963c16p-2,
+ -0x1.000028dedf986p-2, 0x1.966681bfda7f7p-3};
+ double v2 = v * v; // Exact
+ double p2 = fputil::multiply_add(v, COEFFS[3], COEFFS[2]);
+ double p1 = fputil::multiply_add(v, COEFFS[1], COEFFS[0]);
+ double p0 = LOG_R[index] + v;
+ double r = fputil::multiply_add(static_cast<double>(m), LOG_2,
+ fputil::polyeval(v2, p0, p1, p2));
return static_cast<float>(r);
}
diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
index bf3aad7893992..5d9b2ab9a0d16 100644
--- a/libc/test/src/math/logf_test.cpp
+++ b/libc/test/src/math/logf_test.cpp
@@ -26,7 +26,7 @@ TEST(LlvmLibcLogfTest, SpecialNumbers) {
EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::logf(0.0f), FE_DIVBYZERO);
EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, __llvm_libc::logf(-0.0f), FE_DIVBYZERO);
EXPECT_FP_IS_NAN_WITH_EXCEPTION(__llvm_libc::logf(-1.0f), FE_INVALID);
- EXPECT_FP_EQ(zero, __llvm_libc::logf(1.0f));
+ EXPECT_FP_EQ_ALL_ROUNDING(zero, __llvm_libc::logf(1.0f));
}
TEST(LlvmLibcLogfTest, TrickyInputs) {
More information about the libc-commits
mailing list