[libc-commits] [libc] [libc][math] Optimize `asinpif` and `acospif` using estrin's scheme (PR #184286)

Wed Mar 4 13:06:53 PST 2026

https://github.com/hulxv updated https://github.com/llvm/llvm-project/pull/184286

>From ae5b32da59b6fa4843356bbb762f90b70e7b9d65 Mon Sep 17 00:00:00 2001
From: hulxv <hulxxv at gmail.com>
Date: Tue, 3 Mar 2026 05:08:20 +0200
Subject: [PATCH] [libc][math] optimize `asinpif` and `acospif`

---
 libc/src/__support/math/asinpif.h         | 22 -----------
 libc/src/__support/math/inv_trigf_utils.h | 48 +++++++++++++++++++----
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/libc/src/__support/math/asinpif.h b/libc/src/__support/math/asinpif.h
index 9a5daf6198a42..79d3ebbe63b5b 100644
--- a/libc/src/__support/math/asinpif.h
+++ b/libc/src/__support/math/asinpif.h
@@ -23,22 +23,6 @@ namespace LIBC_NAMESPACE_DECL {
 namespace math {
 
 LIBC_INLINE float asinpif(float x) {
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  constexpr size_t N_EXCEPTS = 5;
-  constexpr fputil::ExceptValues<float, N_EXCEPTS> ASINPIF_EXCEPTS = {
-      {// (inputs, RZ output, RU offset, RD offset, RN offset)
-       // x = 0x1.e768f6p-122, asinpif(x) = 0x1.364b7ap-123 (RZ)
-       {0x02F3B47B, 0x021B25BD, 1, 0, 0},
-       // x = 0x1.e768f6p-24, asinpif(x) = 0x1.364b7ap-25 (RZ)
-       {0x33F3B47B, 0x331B25BD, 1, 0, 1},
-       // x = 0x1.dddb4ep-19, asinpif(x) = 0x1.303686p-20 (RZ)
-       {0x366EEDA7, 0x35981B43, 1, 0, 1},
-       // x = -0x1.dddb4ep-19, asinpif(x) = -0x1.303686p-20 (RZ)
-       {0xB66EEDA7, 0xB5981B43, 0, 1, 1},
-       // x = -0x1.e768f6p-24, asinpif(x) = -0x1.364b7ap-25 (RZ)
-       {0xB3F3B47B, 0xB31B25BD, 0, 1, 1}}};
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
   using FPBits = fputil::FPBits<float>;
 
   FPBits xbits(x);
@@ -61,12 +45,6 @@ LIBC_INLINE float asinpif(float x) {
     return FPBits::quiet_nan().get_val();
   }
 
-#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-  auto r = ASINPIF_EXCEPTS.lookup(xbits.uintval());
-  if (LIBC_UNLIKELY(r.has_value()))
-    return r.value();
-#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
-
   // if |x| <= 0.5:
   //   asinpi(x) = x * (c0 + x^2 * P1(x^2))
   if (LIBC_UNLIKELY(x_abs <= 0.5)) {
diff --git a/libc/src/__support/math/inv_trigf_utils.h b/libc/src/__support/math/inv_trigf_utils.h
index 7a93831333db0..a26fa86019444 100644
--- a/libc/src/__support/math/inv_trigf_utils.h
+++ b/libc/src/__support/math/inv_trigf_utils.h
@@ -184,14 +184,14 @@ LIBC_INLINE double asin_eval(double xsq) {
 // > prec = 200;
 // > display = hexadecimal;
 // > g = asin(x) / (pi * x);
-// > P = fpminimax(g, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|],
+// > P = fpminimax(g, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22|],
 // >              [|D...|], [0, 0.5]);
 // > for i from 0 to degree(P) do coeff(P, i);
 // > print("Error:", dirtyinfnorm(P - g, [1e-30; 0.25]));
-// Error: 0x1.45c281e1cf9b58p-50 ~= 2^−49.652
+// Error : 0x1.a53f84eafa3ea69bb81b6c52b3278872083fca2c757bd778acp-54 ~= 2^-54
 //
 // Non-zero coefficients (even powers only):
-LIBC_INLINE_VAR constexpr double ASINPI_COEFFS[13] = {
+LIBC_INLINE_VAR constexpr double ASINPI_COEFFS[12] = {
     0x1.45f306dc9c881p-2,  // x^0
     0x1.b2995e7b7e756p-5,  // x^2
     0x1.8723a1d12f828p-6,  // x^4
@@ -206,12 +206,44 @@ LIBC_INLINE_VAR constexpr double ASINPI_COEFFS[13] = {
     0x1.4b50c2eb13708p-7   // x^22
 };
 
-// Evaluates P1(v2) = c1 + c2*v2 + c3*v2^2 + ... (tail of P without c0)
+// Evaluates P1(v2) = c1 + c2*v2 + c3*v2^2 + ... + c12*v2^11 (tail of P
+// without c0) using Estrin's scheme for instruction-level parallelism.
+//
+// The tail polynomial has 12 coefficients ASINPI_COEFFS[1..11] in powers of
+// v2:
+//   P1(v2) = c1 + c2*v2 + c3*v2^2 + c4*v2^3 + ... + c11*v2^10
+//
+// Estrin pairs them bottom-up:
+//   Level 0 (6 pairs, using v2):
+//     p0 = c1  + c2*v2       p1 = c3  + c4*v2
+//     p2 = c5  + c6*v2       p3 = c7  + c8*v2
+//     p4 = c9  + c10*v2      p5 = c11
+//   Level 1 (3 pairs, using v4):
+//     q0 = p0 + p1*v4        q1 = p2 + p3*v4
+//     q2 = p4 + p5*v4
+//   Level 2 (using v8):
+//     r0 = q0 + q1*v8        r1 = q2
+//   Level 3 (using v16):
+//     result = r0 + r1*v16
 LIBC_INLINE double asinpi_eval(double v2) {
-  return fputil::polyeval(
-      v2, ASINPI_COEFFS[1], ASINPI_COEFFS[2], ASINPI_COEFFS[3],
-      ASINPI_COEFFS[4], ASINPI_COEFFS[5], ASINPI_COEFFS[6], ASINPI_COEFFS[7],
-      ASINPI_COEFFS[8], ASINPI_COEFFS[9], ASINPI_COEFFS[10], ASINPI_COEFFS[11]);
+  double v4 = v2 * v2;
+  double v8 = v4 * v4;
+  double v16 = v8 * v8;
+
+  double p0 = fputil::multiply_add(v2, ASINPI_COEFFS[2], ASINPI_COEFFS[1]);
+  double p1 = fputil::multiply_add(v2, ASINPI_COEFFS[4], ASINPI_COEFFS[3]);
+  double p2 = fputil::multiply_add(v2, ASINPI_COEFFS[6], ASINPI_COEFFS[5]);
+  double p3 = fputil::multiply_add(v2, ASINPI_COEFFS[8], ASINPI_COEFFS[7]);
+  double p4 = fputil::multiply_add(v2, ASINPI_COEFFS[10], ASINPI_COEFFS[9]);
+  double p5 = ASINPI_COEFFS[11];
+
+  double q0 = fputil::multiply_add(v4, p1, p0);
+  double q1 = fputil::multiply_add(v4, p3, p2);
+  double q2 = fputil::multiply_add(v4, p5, p4);
+
+  double r0 = fputil::multiply_add(v8, q1, q0);
+
+  return fputil::multiply_add(v16, q2, r0);
 }
 
 } // namespace inv_trigf_utils_internal