[libc-commits] [libc] [libc][math] Implement double precision cbrt correctly rounded to all rounding modes. (PR #99262)

Wed Jul 17 07:54:42 PDT 2024

================
@@ -0,0 +1,340 @@
+//===-- Implementation of cbrt function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cbrt.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/integer_literals.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
+#define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+
+using DoubleDouble = fputil::DoubleDouble;
+using Float128 = typename fputil::DyadicFloat<128>;
+
+namespace {
+
+// Initial approximation of x^(-2/3) for 1 <= x < 2.
+// Polynomial generated by Sollya with:
+// > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]);
+// > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]);
+// 0x1.28...p-21
+constexpr double intial_approximation(double x) {
+  constexpr double COEFFS[8] = {
+      0x1.bc52aedead5c6p1,  -0x1.b52bfebf110b3p2,  0x1.1d8d71d53d126p3,
+      -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2,   -0x1.5973c66ee6da7p0,
+      0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6,
+  };
+
+  double x_sq = x * x;
+
+  double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]);
+  double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]);
+  double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]);
+  double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]);
+
+  double x_4 = x_sq * x_sq;
+  double d0 = fputil::multiply_add(x_sq, c1, c0);
+  double d1 = fputil::multiply_add(x_sq, c3, c2);
+
+  return fputil::multiply_add(x_4, d1, d0);
+}
+
+// Get the error term for Newton iteration:
+//   h(x) = x^3 * a^2 - 1,
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
+  return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
+         fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
+}
+#else
+constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
+  DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3);
+  return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo;
+}
+#endif
+
+} // anonymous namespace
+
+// Correctly rounded cbrt algorithm:
+//
+// === Step 1 - Range reduction ===
+// For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as:
+//   x_r = 1.m
+//   a   = (-1)^s * 2^(e % 3) * (1.m)
+// Then cbrt(x) = x^(1/3) can be computed as:
+//   x^(1/3) = 2^(e / 3) * a^(1/3).
+//
+// In order to avoid division, we compute a^(-2/3) using Newton method and then
+// multiply the results by a:
+//   a^(1/3) = a * a^(-2/3).
+//
+// === Step 2 - First approximation to a^(-2/3) ===
+// First, we use a degree-7 minimax polynomial generated by Sollya to
+// approximate x_r^(-2/3) for 1 <= x_r < 2.
+//   p = P(x_r) ~ x_r^(-2/3),
+// with relative errors bounded by:
+//   | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21.
+//
+// Then we multiply with 2^(e % 3) from a small lookup table to get:
+//   x_0 = 2^(-2*(e % 3)/3) * p
+//       ~ 2^(-2*(e % 3)/3) * x_r^(-2/3)
+//       = a^(-2/3)
+// With relative errors:
+//   | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21.
+// This step is done in double precision.
+//
+// === Step 3 - First Newton iteration ===
+// We follow the method described in:
+//   Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation
+//   in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf
+// to derive multiplicative Newton iterations as below:
+// Let x_n be the nth approximation to a^(-2/3).  Define the n^th error as:
+//   h_n = x_n^3 * a^2 - 1
+// Then:
+//   a^(-2/3) = x_n / (1 + h_n)^(1/3)
+//            = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...)
+// using the Taylor series expansion of (1 + h_n)^(-1/3).
+//
+// Apply to x_0 above:
+//   h_0 = x_0^3 * a^2 - 1
+//       = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)),
+// it's bounded by:
+//   |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17.
+// So in the first iteration step, we use:
+//   x_1 = x_0 * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3)
+// Its relative error is bounded by:
+//   | x_1 / a^(-2/3) - 1 | < 35/242 * |h_0|^4 < 2^-70.
+// Then we perform Ziv's rounding test and check if the answer is exact.
+// This step is done in double-double precision.
+//
+// === Step 4 - Second Newton iteration ===
+// If the Ziv's rounding test from the previous step fails, we define the error
+// term:
+//   h_1 = x_1^3 * a^2 - 1,
+// And perform another iteration:
+//   x_2 = x_1 * (1 - h_1 / 3)
+// with the relative errors exceed the precision of double-double.
+// We then check the Ziv's accuracy test with relative errors < 2^-102 to
+// compensate for rounding errors.
+//
+// === Step 5 - Final iteration ===
+// If the Ziv's accuracy test from the previous step fails, we perform another
+// iteration in 128-bit precision and check for exact outputs.
+//
+// TODO: It is possible to replace this costly computation step with special
+// exceptional handling, similar to what was done in the CORE-MATH project:
+// https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c
+
+LLVM_LIBC_FUNCTION(double, cbrt, (double x)) {
+  using FPBits = typename fputil::FPBits<double>;
----------------
overmighty wrote:

I don't think `typename` is necessary here either.

https://github.com/llvm/llvm-project/pull/99262