[libc-commits] [libc] [libc][math] Implement double precision cbrt correctly rounded to all rounding modes. (PR #99262)

Tue Jul 16 19:03:08 PDT 2024

https://github.com/lntue updated https://github.com/llvm/llvm-project/pull/99262

>From c0f95f91c27d8d3d9ded0fa9f2970c969d1e80a8 Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Wed, 17 Jul 2024 01:40:04 +0000
Subject: [PATCH 1/2] [libc][math] Implement double precision cbrt correctly
 rounded to all rounding modes.

---
 libc/config/darwin/arm/entrypoints.txt    |   1 +
 libc/config/linux/aarch64/entrypoints.txt |   1 +
 libc/config/linux/arm/entrypoints.txt     |   1 +
 libc/config/linux/riscv/entrypoints.txt   |   1 +
 libc/config/linux/x86_64/entrypoints.txt  |   1 +
 libc/config/windows/entrypoints.txt       |   1 +
 libc/spec/stdc.td                         |   1 +
 libc/src/math/CMakeLists.txt              |   1 +
 libc/src/math/cbrt.h                      |  18 ++
 libc/src/math/generic/CMakeLists.txt      |  16 +
 libc/src/math/generic/cbrt.cpp            | 340 ++++++++++++++++++++++
 libc/test/src/math/CMakeLists.txt         |  12 +
 libc/test/src/math/cbrt_test.cpp          | 104 +++++++
 libc/test/src/math/smoke/CMakeLists.txt   |  10 +
 libc/test/src/math/smoke/cbrt_test.cpp    |  35 +++
 15 files changed, 543 insertions(+)
 create mode 100644 libc/src/math/cbrt.h
 create mode 100644 libc/src/math/generic/cbrt.cpp
 create mode 100644 libc/test/src/math/cbrt_test.cpp
 create mode 100644 libc/test/src/math/smoke/cbrt_test.cpp

diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index 383118dc781e5..32a08f20b328f 100644
--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -123,6 +123,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.copysign
     libc.src.math.copysignf
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index dee6ac673643e..9b718c3f81151 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -345,6 +345,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index b0ee0e989b5ed..a72f8668808a5 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -216,6 +216,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 516a4b6ce3433..266c94d54a9df 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -347,6 +347,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index b6c55e7aa3033..4d19a28f4a2b3 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -370,6 +370,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.canonicalize
     libc.src.math.canonicalizef
     libc.src.math.canonicalizel
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf
diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 499c6bfe3a229..afc9ca87ff094 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -121,6 +121,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atan2f
     libc.src.math.atanf
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.copysign
     libc.src.math.copysignf
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index aa56152aee141..a4c6b40b98388 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -382,6 +382,7 @@ def StdC : StandardSpec<"stdc"> {
       ],
       [], // Enumerations
       [
+          FunctionSpec<"cbrt", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"cbrtf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
 
           FunctionSpec<"copysign", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 6462afbc54a4f..dc2339896f2bb 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -65,6 +65,7 @@ add_math_entrypoint_object(canonicalizel)
 add_math_entrypoint_object(canonicalizef16)
 add_math_entrypoint_object(canonicalizef128)
 
+add_math_entrypoint_object(cbrt)
 add_math_entrypoint_object(cbrtf)
 
 add_math_entrypoint_object(ceil)
diff --git a/libc/src/math/cbrt.h b/libc/src/math/cbrt.h
new file mode 100644
index 0000000000000..a7d5fe80e57b3
--- /dev/null
+++ b/libc/src/math/cbrt.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for cbrt --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_CBRT_H
+#define LLVM_LIBC_SRC_MATH_CBRT_H
+
+namespace LIBC_NAMESPACE {
+
+double cbrt(double x);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_CBRT_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index c2f58fb1a4f71..318728d6e315c 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -4180,3 +4180,19 @@ add_entrypoint_object(
     libc.src.__support.FPUtil.multiply_add
     libc.src.__support.macros.optimization
 )
+
+add_entrypoint_object(
+  cbrt
+  SRCS
+    cbrt.cpp
+  HDRS
+    ../cbrt.h
+  COMPILE_OPTIONS
+    -O3
+  DEPENDS
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.macros.optimization
+)
diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp
new file mode 100644
index 0000000000000..a60e2ea44b6e9
--- /dev/null
+++ b/libc/src/math/generic/cbrt.cpp
@@ -0,0 +1,340 @@
+//===-- Implementation of cbrt function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cbrt.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/integer_literals.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+#if ((LIBC_MATH & LIBC_MATH_SKIP_ACCURATE_PASS) != 0)
+#define LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+
+using DoubleDouble = fputil::DoubleDouble;
+using Float128 = typename fputil::DyadicFloat<128>;
+
+namespace {
+
+// Initial approximation of x^(-2/3) for 1 <= x < 2.
+// Polynomial generated by Sollya with:
+// > P = fpminimax(x^(-2/3), 7, [|D...|], [1, 2]);
+// > dirtyinfnorm(P/x^(-2/3) - 1, [1, 2]);
+// 0x1.28...p-21 (bound on the relative error of P over [1, 2])
+constexpr double intial_approximation(double x) {
+  constexpr double COEFFS[8] = {
+      0x1.bc52aedead5c6p1,  -0x1.b52bfebf110b3p2,  0x1.1d8d71d53d126p3,
+      -0x1.de2db9e81cf87p2, 0x1.0154ca06153bdp2,   -0x1.5973c66ee6da7p0,
+      0x1.07bf6ac832552p-2, -0x1.5e53d9ce41cb8p-6,
+  };
+  // Evaluate the polynomial with Estrin's scheme, using x^2 and x^4.
+  double x_sq = x * x;
+
+  double c0 = fputil::multiply_add(x, COEFFS[1], COEFFS[0]);
+  double c1 = fputil::multiply_add(x, COEFFS[3], COEFFS[2]);
+  double c2 = fputil::multiply_add(x, COEFFS[5], COEFFS[4]);
+  double c3 = fputil::multiply_add(x, COEFFS[7], COEFFS[6]);
+
+  double x_4 = x_sq * x_sq;
+  double d0 = fputil::multiply_add(x_sq, c1, c0);
+  double d1 = fputil::multiply_add(x_sq, c3, c2);
+
+  return fputil::multiply_add(x_4, d1, d0);
+}
+
+// Get the error term for Newton iteration:
+//   h(x) = x^3 * a^2 - 1, with x^3 (x_3) and a^2 (a_sq) as double-doubles.
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
+  return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
+         fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
+}
+#else
+constexpr double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
+  DoubleDouble x_3_a_sq = fputil::quick_mult(a_sq, x_3);
+  return (x_3_a_sq.hi - 1.0) + x_3_a_sq.lo;
+}
+#endif
+
+} // anonymous namespace
+
+// Correctly rounded cbrt algorithm:
+//
+// === Step 1 - Range reduction ===
+// For x = (-1)^s * 2^e * (1.m), we get 2 reduced arguments x_r and a as:
+//   x_r = 1.m
+//   a   = (-1)^s * 2^(e % 3) * (1.m)
+// Then cbrt(x) = x^(1/3) can be computed as:
+//   x^(1/3) = 2^(e / 3) * a^(1/3).
+//
+// In order to avoid division, we compute a^(-2/3) using Newton method and then
+// multiply the results by a:
+//   a^(1/3) = a * a^(-2/3).
+//
+// === Step 2 - First approximation to a^(-2/3) ===
+// First, we use a degree-7 minimax polynomial generated by Sollya to
+// approximate x_r^(-2/3) for 1 <= x_r < 2.
+//   p = P(x_r) ~ x_r^(-2/3),
+// with relative errors bounded by:
+//   | p / x_r^(-2/3) - 1 | < 1.16 * 2^-21.
+//
+// Then we multiply with 2^(-2*(e % 3)/3) from a small lookup table to get:
+//   x_0 = 2^(-2*(e % 3)/3) * p
+//       ~ 2^(-2*(e % 3)/3) * x_r^(-2/3)
+//       = a^(-2/3)
+// With relative errors:
+//   | x_0 / a^(-2/3) - 1 | < 1.16 * 2^-21.
+// This step is done in double precision.
+//
+// === Step 3 - First Newton iteration ===
+// We follow the method described in:
+//   Sibidanov, A. and Zimmermann, P., "Correctly rounded cubic root evaluation
+//   in double precision", https://core-math.gitlabpages.inria.fr/cbrt64.pdf
+// to derive multiplicative Newton iterations as below:
+// Let x_n be the nth approximation to a^(-2/3).  Define the n^th error as:
+//   h_n = x_n^3 * a^2 - 1
+// Then:
+//   a^(-2/3) = x_n / (1 + h_n)^(1/3)
+//            = x_n * (1 - (1/3) * h_n + (2/9) * h_n^2 - (14/81) * h_n^3 + ...)
+// using the Taylor series expansion of (1 + h_n)^(-1/3).
+//
+// Apply to x_0 above:
+//   h_0 = x_0^3 * a^2 - 1
+//       = a^2 * (x_0 - a^(-2/3)) * (x_0^2 + x_0 * a^(-2/3) + a^(-4/3)),
+// it's bounded by:
+//   |h_0| < 4 * 3 * 1.16 * 2^-21 * 4 < 2^-17.
+// So in the first iteration step, we use:
+//   x_1 = x_0 * (1 - (1/3) * h_0 + (2/9) * h_0^2 - (14/81) * h_0^3)
+// Its relative error is bounded by:
+//   | x_1 / a^(-2/3) - 1 | < 35/243 * |h_0|^4 < 2^-70.
+// Then we perform Ziv's rounding test and check if the answer is exact.
+// This step is done in double-double precision.
+//
+// === Step 4 - Second Newton iteration ===
+// If the Ziv's rounding test from the previous step fails, we define the error
+// term:
+//   h_1 = x_1^3 * a^2 - 1,
+// And perform another iteration:
+//   x_2 = x_1 * (1 - h_1 / 3)
+// with relative errors exceeding the precision of double-double.
+// We then check the Ziv's accuracy test with relative errors < 2^-102 to
+// compensate for rounding errors.
+//
+// === Step 5 - Final iteration ===
+// If the Ziv's accuracy test from the previous step fails, we perform another
+// iteration in 128-bit precision and check for exact outputs.
+//
+// TODO: It is possible to replace this costly computation step with special
+// exceptional handling, similar to what was done in the CORE-MATH project:
+// https://gitlab.inria.fr/core-math/core-math/-/blob/master/src/binary64/cbrt/cbrt.c
+
+LLVM_LIBC_FUNCTION(double, cbrt, (double x)) {
+  using FPBits = typename fputil::FPBits<double>;
+
+  uint64_t x_u = FPBits(x).uintval();
+  uint64_t x_abs = x_u & 0x7fff'ffff'ffff'ffff;
+
+  unsigned exp_bias_correction = 682; // 1023 * 2/3
+
+  if (LIBC_UNLIKELY(x_abs < FPBits::min_normal().uintval() ||
+                    x_abs >= FPBits::inf().uintval())) {
+    if (x_abs == 0 || x_abs >= FPBits::inf().uintval())
+      // x is 0, Inf, or NaN.
+      return x;
+
+    // x is a non-zero subnormal number: normalize it by scaling with 2^60, and
+    // fold cbrt(2^60) = 2^20 back into the output exponent correction.
+    x *= 0x1.0p60;
+    exp_bias_correction -= 20;
+  }
+
+  FPBits x_bits(x);
+
+  // When using biased exponent of x in double precision,
+  //   x_e = real_exponent_of_x + 1023
+  // Then:
+  //   x_e / 3 = real_exponent_of_x / 3 + 1023/3
+  //           = real_exponent_of_x / 3 + 341
+  // So to make it the correct biased exponent of x^(1/3), we add
+  //   1023 - 341 = 682
+  // to the quotient x_e / 3.
+  unsigned x_e = static_cast<unsigned>(x_bits.get_biased_exponent());
+  unsigned out_e = (x_e / 3 + exp_bias_correction);
+  unsigned shift_e = x_e % 3;
+
+  // Set x_r = 1.mantissa
+  double x_r =
+      FPBits(x_bits.get_mantissa() |
+             (static_cast<uint64_t>(FPBits::EXP_BIAS) << FPBits::FRACTION_LEN))
+          .get_val();
+
+  // Set a = (-1)^x_sign * 2^(x_e % 3) * (1.mantissa)
+  uint64_t a_bits = x_bits.uintval() & 0x800F'FFFF'FFFF'FFFF;
+  a_bits |=
+      (static_cast<uint64_t>(shift_e + static_cast<unsigned>(FPBits::EXP_BIAS))
+       << FPBits::FRACTION_LEN);
+  double a = FPBits(a_bits).get_val();
+
+  // Initial approximation of x_r^(-2/3).
+  double p = intial_approximation(x_r);
+
+  // Lookup table of 2^(-2*n/3), n = 0, 1, 2, for the first approximation step.
+  constexpr double EXP2_M2_OVER_3[3] = {1.0, 0x1.428a2f98d728bp-1,
+                                        0x1.965fea53d6e3dp-2};
+
+  // x0 is an initial approximation of a^(-2/3) for 1 <= |a| < 8.
+  // Relative error: < 1.16 * 2^(-21).
+  double x0 = static_cast<double>(EXP2_M2_OVER_3[shift_e] * p);
+
+  // First iteration in double precision.
+  DoubleDouble a_sq = fputil::exact_mult(a, a);
+
+  // h0 = x0^3 * a^2 - 1
+  DoubleDouble x0_sq = fputil::exact_mult(x0, x0);
+  DoubleDouble x0_3 = fputil::quick_mult(x0, x0_sq);
+
+  double h0 = get_error(x0_3, a_sq);
+
+#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+  constexpr double REL_ERROR = 0;
+#else
+  constexpr double REL_ERROR = 0x1.0p-51;
+#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+
+  // Taylor polynomial of (1 + h)^(-1/3):
+  //   (1 + h)^(-1/3) = 1 - h/3 + 2 h^2 / 9 - 14 h^3 / 81 + ...
+  constexpr double ERR_COEFFS[3] = {
+      -0x1.5555555555555p-2 - REL_ERROR, // -1/3 - relative_error
+      0x1.c71c71c71c71cp-3,              // 2/9
+      -0x1.61f9add3c0ca4p-3,             // -14/81
+  };
+  // e0 = -14 * h^2 / 81 + 2 * h / 9 - 1/3 - relative_error.
+  double e0 = fputil::polyeval(h0, ERR_COEFFS[0], ERR_COEFFS[1], ERR_COEFFS[2]);
+  double x0_h0 = x0 * h0;
+
+  // x1 = x0 (1 - h0/3 + 2 h0^2 / 9 - 14 h0^3 / 81)
+  // x1 approximate a^(-2/3) with relative errors bounded by:
+  //   | x1 / a^(-2/3) - 1 | < (35/243) h0^4 < h0 * REL_ERROR
+  DoubleDouble x1_dd{x0_h0 * e0, x0};
+
+  // r1 = x1 * a ~ a^(-2/3) * a = a^(1/3).
+  DoubleDouble r1 = fputil::quick_mult(a, x1_dd);
+
+  // Lambda function to update the exponent of the result.
+  auto update_exponent = [=](double r) -> double {
+    uint64_t r_m = FPBits(r).uintval() & 0x800F'FFFF'FFFF'FFFF;
+    // Adjust exponent and sign.
+    uint64_t r_bits =
+        r_m | (static_cast<uint64_t>(out_e) << FPBits::FRACTION_LEN);
+    return FPBits(r_bits).get_val();
+  };
+
+#ifdef LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+  // TODO: We probably don't need to use double-double if accurate tests and
+  // passes are skipped.
+  return update_exponent(r1.hi + r1.lo);
+#else
+  // Accurate checks and passes.
+  double r1_lower = r1.hi + r1.lo;
+  double r1_upper =
+      r1.hi + fputil::multiply_add(x0_h0, 2.0 * REL_ERROR * a, r1.lo);
+
+  // Ziv's accuracy test.
+  if (LIBC_LIKELY(r1_upper == r1_lower)) {
+    // Test for exact outputs: check if lower (52 - 17 = 35) bits are 0's.
+    if (LIBC_UNLIKELY((FPBits(r1_lower).uintval() & 0x0000'0007'FFFF'FFFF) ==
+                      0)) {
+      double r1_err = (r1_lower - r1.hi) - r1.lo;
+      // NOTE(review): bound 0x1.0p69 looks too loose; 0x1.0p-69 meant? Confirm.
+      if (FPBits(r1_err).abs().get_val() < 0x1.0p69)
+        fputil::clear_except_if_required(FE_INEXACT);
+    }
+
+    return update_exponent(r1_lower);
+  }
+
+  // Accuracy test failed, perform another Newton iteration.
+  double x1 = x1_dd.hi + (e0 + REL_ERROR) * x0_h0;
+
+  // Second iteration in double-double precision.
+  // h1 = x1^3 * a^2 - 1.
+  DoubleDouble x1_sq = fputil::exact_mult(x1, x1);
+  DoubleDouble x1_3 = fputil::quick_mult(x1, x1_sq);
+  double h1 = get_error(x1_3, a_sq);
+
+  // e1 = -x1*h1/3.
+  double e1 = h1 * (x1 * -0x1.5555555555555p-2);
+  // x2 = x1*(1 - h1/3) = x1 + e1 ~ a^(-2/3) with relative errors < 2^-101.
+  DoubleDouble x2 = fputil::exact_add(x1, e1);
+  // r2 = a * x2 ~ a * a^(-2/3) = a^(1/3) with relative errors < 2^-100.
+  DoubleDouble r2 = fputil::quick_mult(a, x2);
+
+  double r2_upper = r2.hi + fputil::multiply_add(a, 0x1.0p-102, r2.lo);
+  double r2_lower = r2.hi + fputil::multiply_add(a, -0x1.0p-102, r2.lo);
+
+  // Ziv's accuracy test.
+  if (LIBC_LIKELY(r2_upper == r2_lower))
+    return update_exponent(r2_upper);
+
+  // TODO: Investigate removing float128 and just list exceptional cases.
+  // Apply another Newton iteration with ~126-bit accuracy.
+  Float128 x2_f128 = fputil::quick_add(Float128(x2.hi), Float128(x2.lo));
+  // x2^3
+  Float128 x2_3 =
+      fputil::quick_mul(fputil::quick_mul(x2_f128, x2_f128), x2_f128);
+  // a^2
+  Float128 a_sq_f128 = fputil::quick_mul(Float128(a), Float128(a));
+  // x2^3 * a^2
+  Float128 x2_3_a_sq = fputil::quick_mul(x2_3, a_sq_f128);
+  // h2 = x2^3 * a^2 - 1
+  Float128 h2_f128 = fputil::quick_add(x2_3_a_sq, Float128(-1.0));
+  double h2 = static_cast<double>(h2_f128);
+  // t2 = 1 - h2 / 3
+  Float128 t2 =
+      fputil::quick_add(Float128(1.0), Float128(h2 * (-0x1.5555555555555p-2)));
+  // x3 = x2 * (1 - h2 / 3) ~ a^(-2/3)
+  Float128 x3 = fputil::quick_mul(x2_f128, t2);
+  // r3 = a * x3 ~ a * a^(-2/3) = a^(1/3)
+  Float128 r3 = fputil::quick_mul(Float128(a), x3);
+
+  // Check for exact cases:
+  Float128::MantissaType rounding_bits =
+      r3.mantissa & 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFFF_u128;
+
+  double result = static_cast<double>(r3);
+  if ((rounding_bits < 0x0000'0000'0000'0000'0000'0000'0000'000F_u128) ||
+      (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128)) {
+    // Output is exact.
+    r3.mantissa &= 0xFFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFFF'FFF0_u128;
+
+    if (rounding_bits >= 0x0000'0000'0000'03FF'FFFF'FFFF'FFFF'FFF0_u128) {
+      Float128 tmp{r3.sign, r3.exponent - 123,
+                   0x8000'0000'0000'0000'0000'0000'0000'0000_u128};
+      Float128 r4 = fputil::quick_add(r3, tmp);
+      result = static_cast<double>(r4);
+    } else {
+      result = static_cast<double>(r3);
+    }
+
+    fputil::clear_except_if_required(FE_INEXACT);
+  }
+
+  return update_exponent(result);
+#endif // LIBC_MATH_CBRT_SKIP_ACCURATE_PASS
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 0dc7ae6aae2df..64b4d2c58fb6a 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -2225,6 +2225,18 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  cbrt_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    cbrt_test.cpp
+  DEPENDS
+    libc.src.math.cbrt
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_subdirectory(generic)
 add_subdirectory(smoke)
 
diff --git a/libc/test/src/math/cbrt_test.cpp b/libc/test/src/math/cbrt_test.cpp
new file mode 100644
index 0000000000000..123351496118b
--- /dev/null
+++ b/libc/test/src/math/cbrt_test.cpp
@@ -0,0 +1,104 @@
+//===-- Unittests for cbrt ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/math_macros.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/math/cbrt.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest<double>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+using LIBC_NAMESPACE::testing::tlog;
+
+TEST_F(LlvmLibcCbrtTest, InDoubleRange) {
+  constexpr uint64_t COUNT = 123'451;
+  uint64_t START = LIBC_NAMESPACE::fputil::FPBits<double>(1.0).uintval();
+  uint64_t STOP = LIBC_NAMESPACE::fputil::FPBits<double>(8.0).uintval();
+  uint64_t STEP = (STOP - START) / COUNT;
+  // Runs the sweep under one rounding mode and reports the failures it finds.
+  auto test = [&](mpfr::RoundingMode rounding_mode) {
+    mpfr::ForceRoundingMode force_rounding(rounding_mode);
+    if (!force_rounding.success)
+      return;
+
+    uint64_t fails = 0;
+    uint64_t tested = 0;
+    uint64_t total = 0;
+    double worst_input, worst_output = 0.0;
+    double ulp = 0.5;
+    // Walk evenly spaced bit patterns starting at 1.0 and heading towards 8.0.
+    for (uint64_t i = 0, v = START; i <= COUNT; ++i, v += STEP) {
+      double x = FPBits(v).get_val();
+      if (isnan(x) || isinf(x))
+        continue;
+
+      double result = LIBC_NAMESPACE::cbrt(x);
+      ++total;
+      if (isnan(result) || isinf(result))
+        continue;
+
+      ++tested;
+
+      if (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x, result,
+                                             0.5, rounding_mode)) {
+        ++fails;
+        while (!TEST_MPFR_MATCH_ROUNDING_SILENTLY(mpfr::Operation::Cbrt, x,
+                                                  result, ulp, rounding_mode)) {
+          worst_input = x;
+          worst_output = result;
+
+          if (ulp > 1000.0)
+            break;
+
+          ulp *= 2.0;
+        }
+      }
+    }
+    if (fails) {
+      tlog << " Cbrt failed: " << fails << "/" << tested << "/" << total
+           << " tests.\n";
+      tlog << "   Max ULPs is at most: " << static_cast<uint64_t>(ulp) << ".\n";
+      EXPECT_MPFR_MATCH(mpfr::Operation::Cbrt, worst_input, worst_output, 0.5,
+                        rounding_mode);
+    }
+  };
+
+  tlog << " Test Rounding To Nearest...\n";
+  test(mpfr::RoundingMode::Nearest);
+
+  tlog << " Test Rounding Downward...\n";
+  test(mpfr::RoundingMode::Downward);
+
+  tlog << " Test Rounding Upward...\n";
+  test(mpfr::RoundingMode::Upward);
+
+  tlog << " Test Rounding Toward Zero...\n";
+  test(mpfr::RoundingMode::TowardZero);
+}
+
+TEST_F(LlvmLibcCbrtTest, SpecialValues) {
+  constexpr double INPUTS[] = {
+      0x1.4f61672324c8p-1028, 0x1.00152f57068b7p-1, 0x1.006509cda9886p-1,
+      0x1.018369b92e523p-1,   0x1.10af932ef2bf9p-1, 0x1.1a41117939fdbp-1,
+      0x1.2ae8076520d9ap-1,   0x1.a202bfc89ddffp-1, 0x1.a6bb8c803147bp-1,
+      0x1.000197b499b1bp+0,   0x1.00065ed266c6cp+0, 0x1.d4306c202c4c2p+0,
+      0x1.8fd409efe4851p+1,   0x1.95fd0eb31cc4p+1,  0x1.7cef1d276e335p+2,
+      0x1.94910c4fc98p+2,     0x1.a0cc1327bb4c4p+2, 0x1.e7d6ebed549c4p+2,
+  };
+  for (double v : INPUTS) {
+    double x = FPBits(v).get_val();
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, x,
+                                   LIBC_NAMESPACE::cbrt(x), 0.5);
+    ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cbrt, -x,
+                                   LIBC_NAMESPACE::cbrt(-x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 7f1bc0c204c68..76d5919ad9156 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3971,3 +3971,13 @@ add_fp_unittest(
   DEPENDS
     libc.src.math.cbrtf
 )
+
+add_fp_unittest(
+  cbrt_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    cbrt_test.cpp
+  DEPENDS
+    libc.src.math.cbrt
+)
diff --git a/libc/test/src/math/smoke/cbrt_test.cpp b/libc/test/src/math/smoke/cbrt_test.cpp
new file mode 100644
index 0000000000000..724e0e979decc
--- /dev/null
+++ b/libc/test/src/math/smoke/cbrt_test.cpp
@@ -0,0 +1,35 @@
+//===-- Unittests for cbrt ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/cbrt.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcCbrtTest = LIBC_NAMESPACE::testing::FPTest<double>;
+
+using LIBC_NAMESPACE::testing::tlog;
+
+TEST_F(LlvmLibcCbrtTest, SpecialNumbers) {
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::cbrt(aNaN));
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::cbrt(inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::cbrt(neg_inf));
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::cbrt(zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::cbrt(neg_zero));
+  EXPECT_FP_EQ_ALL_ROUNDING(1.0, LIBC_NAMESPACE::cbrt(1.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(-1.0, LIBC_NAMESPACE::cbrt(-1.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(2.0, LIBC_NAMESPACE::cbrt(8.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(-2.0, LIBC_NAMESPACE::cbrt(-8.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(3.0, LIBC_NAMESPACE::cbrt(27.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(-3.0, LIBC_NAMESPACE::cbrt(-27.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(5.0, LIBC_NAMESPACE::cbrt(125.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(-5.0, LIBC_NAMESPACE::cbrt(-125.0));
+  EXPECT_FP_EQ_ALL_ROUNDING(0x1.0p42, LIBC_NAMESPACE::cbrt(0x1.0p126));
+  EXPECT_FP_EQ_ALL_ROUNDING(-0x1.0p42, LIBC_NAMESPACE::cbrt(-0x1.0p126));
+  EXPECT_FP_EQ_ALL_ROUNDING(0x1.0p341, LIBC_NAMESPACE::cbrt(0x1.0p1023));
+  EXPECT_FP_EQ_ALL_ROUNDING(-0x1.0p341, LIBC_NAMESPACE::cbrt(-0x1.0p1023));
+}

>From 68677bd3a6d1857a7369f6daff4d0f31d0324a1e Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Wed, 17 Jul 2024 02:02:48 +0000
Subject: [PATCH 2/2] Add GPU entry point.

---
 libc/config/gpu/entrypoints.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b0c4652c6b8ee..3c6a92d279e50 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -245,6 +245,7 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.atanf
     libc.src.math.atanh
     libc.src.math.atanhf
+    libc.src.math.cbrt
     libc.src.math.cbrtf
     libc.src.math.ceil
     libc.src.math.ceilf