[libc-commits] [libc] [libc][math] Implement fast pass for double precision pow function with up to 1ULP error. (PR #101926)

Mon Aug 5 05:05:08 PDT 2024

================
@@ -0,0 +1,490 @@
+//===-- Double-precision x^y function -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/pow.h"
+#include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2.
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/FPUtil/sqrt.h" // Speedup for pow(x, 1/2) = sqrt(x)
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE_DECL {
+
+using fputil::DoubleDouble;
+
+namespace {
+
+// Constants for log2(x) range reduction, generated by Sollya with:
+// > for i from 0 to 127 do {
+//     r = 2^-8 * ceil( 2^8 * (1 - 2^(-8)) / (1 + i*2^-7) );
+//     b = nearestint(log2(r) * 2^41) * 2^-41;
+//     c = round(log2(r) - b, D, RN);
+//     print("{", -c, ",", -b, "},");
+//   };
+// This is the same as -log2(RD[i]), with the least significant bits of the
+// high part set to be 2^-41, so that the sum of high parts + e_x is exact in
+// double precision.
+// We also replace the first and the last ones to be 0.
+constexpr DoubleDouble LOG2_R_DD[128] = {
+    {0.0, 0.0},
+    {-0x1.19b14945cf6bap-44, 0x1.72c7ba21p-7},
+    {-0x1.95539356f93dcp-43, 0x1.743ee862p-6},
+    {0x1.abe0a48f83604p-43, 0x1.184b8e4c5p-5},
+    {0x1.635577970e04p-43, 0x1.77394c9d9p-5},
+    {-0x1.401fbaaa67e3cp-45, 0x1.d6ebd1f2p-5},
+    {-0x1.5b1799ceaeb51p-43, 0x1.1bb32a6008p-4},
+    {0x1.7c407050799bfp-43, 0x1.4c560fe688p-4},
+    {0x1.da6339da288fcp-43, 0x1.7d60496cf8p-4},
+    {0x1.be4f6f22dbbadp-43, 0x1.960caf9ab8p-4},
+    {-0x1.c760bc9b188c4p-45, 0x1.c7b528b71p-4},
+    {0x1.164e932b2d51cp-44, 0x1.f9c95dc1dp-4},
+    {0x1.924ae921f7ecap-45, 0x1.097e38ce6p-3},
+    {-0x1.6d25a5b8a19b2p-44, 0x1.22dadc2ab4p-3},
+    {0x1.e50a1644ac794p-43, 0x1.3c6fb650ccp-3},
+    {0x1.f34baa74a7942p-43, 0x1.494f863b8cp-3},
+    {-0x1.8f7aac147fdc1p-46, 0x1.633a8bf438p-3},
+    {0x1.f84be19cb9578p-43, 0x1.7046031c78p-3},
+    {-0x1.66cccab240e9p-46, 0x1.8a8980abfcp-3},
+    {-0x1.3f7a55cd2af4cp-47, 0x1.97c1cb13c8p-3},
+    {0x1.3458cde69308cp-43, 0x1.b2602497d4p-3},
+    {-0x1.667f21fa8423fp-44, 0x1.bfc67a8p-3},
+    {0x1.d2fe4574e09b9p-47, 0x1.dac22d3e44p-3},
+    {0x1.367bde40c5e6dp-43, 0x1.e857d3d36p-3},
+    {0x1.d45da26510033p-46, 0x1.01d9bbcfa6p-2},
+    {-0x1.7204f55bbf90dp-44, 0x1.08bce0d96p-2},
+    {-0x1.d4f1b95e0ff45p-43, 0x1.169c05364p-2},
+    {0x1.c20d74c0211bfp-44, 0x1.1d982c9d52p-2},
+    {0x1.ad89a083e072ap-43, 0x1.249cd2b13cp-2},
+    {0x1.cd0cb4492f1bcp-43, 0x1.32bfee370ep-2},
+    {-0x1.2101a9685c779p-47, 0x1.39de8e155ap-2},
+    {0x1.9451cd394fe8dp-43, 0x1.4106017c3ep-2},
+    {0x1.661e393a16b95p-44, 0x1.4f6fbb2cecp-2},
+    {-0x1.c6d8d86531d56p-44, 0x1.56b22e6b58p-2},
+    {0x1.c1c885adb21d3p-43, 0x1.5dfdcf1eeap-2},
+    {0x1.3bb5921006679p-45, 0x1.6552b49986p-2},
+    {0x1.1d406db502403p-43, 0x1.6cb0f6865cp-2},
+    {0x1.55a63e278bad5p-43, 0x1.7b89f02cf2p-2},
+    {-0x1.66ae2a7ada553p-49, 0x1.8304d90c12p-2},
+    {-0x1.66cccab240e9p-45, 0x1.8a8980abfcp-2},
+    {-0x1.62404772a151dp-45, 0x1.921800924ep-2},
+    {0x1.ac9bca36fd02ep-44, 0x1.99b072a96cp-2},
+    {0x1.4bc302ffa76fbp-43, 0x1.a8ff97181p-2},
+    {0x1.01fea1ec47c71p-43, 0x1.b0b67f4f46p-2},
+    {-0x1.f20203b3186a6p-43, 0x1.b877c57b1cp-2},
+    {-0x1.2642415d47384p-45, 0x1.c043859e3p-2},
+    {-0x1.bc76a2753b99bp-50, 0x1.c819dc2d46p-2},
+    {-0x1.da93ae3a5f451p-43, 0x1.cffae611aep-2},
+    {-0x1.50e785694a8c6p-43, 0x1.d7e6c0abc4p-2},
+    {0x1.c56138c894641p-43, 0x1.dfdd89d586p-2},
+    {0x1.5669df6a2b592p-43, 0x1.e7df5fe538p-2},
+    {-0x1.ea92d9e0e8ac2p-48, 0x1.efec61b012p-2},
+    {0x1.a0331af2e6feap-43, 0x1.f804ae8d0cp-2},
+    {0x1.9518ce032f41dp-48, 0x1.0014332bep-1},
+    {-0x1.b3b3864c60011p-44, 0x1.042bd4b9a8p-1},
+    {-0x1.103e8f00d41c8p-45, 0x1.08494c66b9p-1},
+    {0x1.65be75cc3da17p-43, 0x1.0c6caaf0c5p-1},
+    {0x1.3676289cd3dd4p-43, 0x1.1096015deep-1},
+    {-0x1.41dfc7d7c3321p-43, 0x1.14c560fe69p-1},
+    {0x1.e0cda8bd74461p-44, 0x1.18fadb6e2dp-1},
+    {0x1.2a606046ad444p-44, 0x1.1d368296b5p-1},
+    {0x1.f9ea977a639cp-43, 0x1.217868b0c3p-1},
+    {-0x1.50520a377c7ecp-45, 0x1.25c0a0463cp-1},
+    {0x1.6e3cb71b554e7p-47, 0x1.2a0f3c3407p-1},
+    {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1},
+    {-0x1.4275f1035e5e8p-48, 0x1.2e644fac05p-1},
+    {-0x1.979a5db68721dp-45, 0x1.32bfee370fp-1},
+    {0x1.1ee969a95f529p-43, 0x1.37222bb707p-1},
+    {0x1.bb4b69336b66ep-43, 0x1.3b8b1c68fap-1},
+    {0x1.d5e6a8a4fb059p-45, 0x1.3ffad4e74fp-1},
+    {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1},
+    {0x1.3106e404cabb7p-44, 0x1.44716a2c08p-1},
+    {-0x1.9bcaf1aa4168ap-43, 0x1.48eef19318p-1},
+    {0x1.1646b761c48dep-44, 0x1.4d7380dcc4p-1},
+    {0x1.2f0c0bfe9dbecp-43, 0x1.51ff2e3021p-1},
+    {0x1.29904613e33cp-43, 0x1.5692101d9bp-1},
+    {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1},
+    {0x1.1d406db502403p-44, 0x1.5b2c3da197p-1},
+    {-0x1.125d6cbcd1095p-44, 0x1.5fcdce2728p-1},
+    {-0x1.bd9b32266d92cp-43, 0x1.6476d98adap-1},
+    {0x1.54243b21709cep-44, 0x1.6927781d93p-1},
+    {0x1.54243b21709cep-44, 0x1.6927781d93p-1},
+    {-0x1.ce60916e52e91p-44, 0x1.6ddfc2a79p-1},
+    {0x1.f1f5ae718f241p-43, 0x1.729fd26b7p-1},
+    {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1},
+    {-0x1.6eb9612e0b4f3p-43, 0x1.7767c12968p-1},
+    {0x1.fed21f9cb2cc5p-43, 0x1.7c37a9227ep-1},
+    {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1},
+    {0x1.7f5dc57266758p-43, 0x1.810fa51bf6p-1},
+    {0x1.5b338360c2ae2p-43, 0x1.85efd062c6p-1},
+    {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1},
+    {-0x1.96fc8f4b56502p-43, 0x1.8ad846cf37p-1},
+    {-0x1.bdc81c4db3134p-44, 0x1.8fc924c89bp-1},
+    {0x1.36c101ee1344p-43, 0x1.94c287492cp-1},
+    {0x1.36c101ee1344p-43, 0x1.94c287492cp-1},
+    {0x1.e41fa0a62e6aep-44, 0x1.99c48be206p-1},
+    {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1},
+    {-0x1.d97ee9124773bp-46, 0x1.9ecf50bf44p-1},
+    {-0x1.3f94e00e7d6bcp-46, 0x1.a3e2f4ac44p-1},
+    {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1},
+    {-0x1.6879fa00b120ap-43, 0x1.a8ff971811p-1},
+    {0x1.1659d8e2d7d38p-44, 0x1.ae255819fp-1},
+    {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1},
+    {0x1.1e5e0ae0d3f8ap-43, 0x1.b35458761dp-1},
+    {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1},
+    {0x1.484a15babcf88p-43, 0x1.b88cb9a2abp-1},
+    {0x1.871a7610e40bdp-45, 0x1.bdce9dcc96p-1},
+    {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1},
+    {-0x1.2d90e5edaeceep-43, 0x1.c31a27dd01p-1},
+    {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1},
+    {-0x1.5dd31d962d373p-43, 0x1.c86f7b7ea5p-1},
+    {-0x1.9ad57391924a7p-43, 0x1.cdcebd2374p-1},
+    {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1},
+    {-0x1.3167ccc538261p-44, 0x1.d338120a6ep-1},
+    {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1},
+    {0x1.c7a4ff65ddbc9p-45, 0x1.d8aba045bp-1},
+    {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1},
+    {-0x1.f9ab3cf74babap-44, 0x1.de298ec0bbp-1},
+    {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1},
+    {0x1.52842c1c1e586p-43, 0x1.e3b20546f5p-1},
+    {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1},
+    {0x1.3c6764fc87b4ap-48, 0x1.e9452c8a71p-1},
+    {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1},
+    {-0x1.a0976c0a2827dp-44, 0x1.eee32e2aedp-1},
+    {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1},
+    {-0x1.a45314dc4fc42p-43, 0x1.f48c34bd1fp-1},
+    {0x1.ef5d00e390ap-44, 0x1.fa406bd244p-1},
+    {0.0, 1.0},
+};
+
+bool is_odd_integer(double x) {
+  using FPBits = fputil::FPBits<double>;
+  FPBits xbits(x);
+  uint64_t x_u = xbits.uintval();
+  unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
+  unsigned lsb =
+      static_cast<unsigned>(cpp::countr_zero(x_u | FPBits::EXP_MASK));
+  constexpr unsigned UNIT_EXPONENT =
+      static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
+  return (x_e + lsb == UNIT_EXPONENT);
+}
+
+bool is_integer(double x) {
+  using FPBits = fputil::FPBits<double>;
+  FPBits xbits(x);
+  uint64_t x_u = xbits.uintval();
+  unsigned x_e = static_cast<unsigned>(xbits.get_biased_exponent());
+  unsigned lsb =
+      static_cast<unsigned>(cpp::countr_zero(x_u | FPBits::EXP_MASK));
+  constexpr unsigned UNIT_EXPONENT =
+      static_cast<unsigned>(FPBits::EXP_BIAS + FPBits::FRACTION_LEN);
+  return (x_e + lsb >= UNIT_EXPONENT);
+}
+
+} // namespace
+
+LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) {
+  using FPBits = fputil::FPBits<double>;
+
+  FPBits xbits(x), ybits(y);
+
+  bool x_sign = xbits.sign() == Sign::NEG;
+  bool y_sign = ybits.sign() == Sign::NEG;
+
+  FPBits x_abs = xbits.abs();
+  FPBits y_abs = ybits.abs();
+
+  uint64_t x_mant = xbits.get_mantissa();
+  uint64_t y_mant = ybits.get_mantissa();
+  uint64_t x_u = xbits.uintval();
+  uint64_t x_a = x_abs.uintval();
+  uint64_t y_a = y_abs.uintval();
+
+  double e_x = static_cast<double>(xbits.get_exponent());
+  uint64_t sign = 0;
+
+  ///////// BEGIN - Check exceptional cases ////////////////////////////////////
+
+  // The double precision number that is closest to 1 is (1 - 2^-53), which has
+  //   log2(1 - 2^-53) ~ -1.715...p-53.
+  // So if |y| > 1075 / log2(1 - 2^-53), and x is finite:
+  //   |y * log2(x)| = 0 or > 1075.
+  // Hence x^y will either overflow or underflow if x is not zero.
+  if (LIBC_UNLIKELY(y_mant == 0 || y_a > 0x43d74910d52d3052 ||
+                    x_u == FPBits::one().uintval() ||
+                    x_u >= FPBits::inf().uintval() ||
+                    x_u < FPBits::min_normal().uintval())) {
+    // Exceptional exponents.
+    switch (y_a) {
+    case 0: // y = +-0.0
+      return 1.0;
+    case 0x3fe0'0000'0000'0000: // y = +-0.5
+      // TODO: speed up x^(-1/2) with rsqrt(x) when available.
+      return y_sign ? (1.0 / fputil::sqrt<double>(x)) : fputil::sqrt<double>(x);
+    case 0x3ff0'0000'0000'0000: // y = +-1.0
+      return y_sign ? (1.0 / x) : x;
+    case 0x4000'0000'0000'0000: // y = +-2.0;
+      return y_sign ? (1.0 / (x * x)) : (x * x);
+    }
+
+    if (y_a > 0x43d7'4910'd52d'3052) {
+      if (y_a >= 0x7ff0'0000'0000'0000) {
+        // y is inf or nan
+        if (y_mant != 0)
+          // y is NaN
+          // pow(1, NaN) = 1
+          // pow(x, NaN) = NaN
+          return (x_u == FPBits::one().uintval()) ? 1.0 : y;
+
+        // Now y is +-Inf
+        if (x_abs.is_nan())
+          // pow(NaN, +-Inf) = NaN
+          return x;
+
+        if (x_a == 0x3ff0'0000'0000'0000)
+          // pow(+-1, +-Inf) = 1.0
+          return 1.0;
+
+        if (x_a == 0 && y_sign) {
+          // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
+          fputil::set_errno_if_required(EDOM);
+          fputil::raise_except_if_required(FE_DIVBYZERO);
+          return FPBits::inf().get_val();
+        }
+        // pow (|x| < 1, -inf) = +inf
+        // pow (|x| < 1, +inf) = 0.0f
+        // pow (|x| > 1, -inf) = 0.0f
+        // pow (|x| > 1, +inf) = +inf
+        return ((x_a < FPBits::one().uintval()) == y_sign)
+                   ? FPBits::inf().get_val()
+                   : 0.0;
+      }
+      // x^y will be overflow / underflow in double precision.  Set y to a
+      // large enough exponent but not too large, so that the computations
+      // won't be overflow in double precision.
+      y = y_sign ? -0x1.0p100 : 0x1.0p100;
+    }
+
+    // y is finite and non-zero.
+
+    if (x_u == FPBits::one().uintval())
+      // pow(1, y) = 1
+      return 1.0;
+
+    // TODO: Speed things up with pow(2, y) = exp2(y) and pow(10, y) = exp10(y).
+
+    if (x_a == 0) {
+      const bool out_is_neg = x_sign && is_odd_integer(y);
----------------
overmighty wrote:

Nit: should be either `bool out_is_neg` or `const bool OUT_IS_NEG` due to https://libc.llvm.org/dev/code_style.html#naming-style.

https://github.com/llvm/llvm-project/pull/101926