[libc-commits] [libc] [libc][math] Improve the performance of sin/cos for small inputs |x| < 2^-4. (PR #201748)

Fri Jun 5 06:24:06 PDT 2026

https://github.com/lntue updated https://github.com/llvm/llvm-project/pull/201748

>From d707ffe770f6408c1ac46b7a7b6d41f91a73788e Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Fri, 5 Jun 2026 05:14:13 +0000
Subject: [PATCH 1/3] [libc][math] Improve the performance of sin/cos for small
 inputs |x| < 2^-4.

---
 libc/src/__support/math/sin.h | 109 +++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/libc/src/__support/math/sin.h b/libc/src/__support/math/sin.h
index cd0c15abe62db..046b83404790e 100644
--- a/libc/src/__support/math/sin.h
+++ b/libc/src/__support/math/sin.h
@@ -29,6 +29,44 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace math {
 
+LIBC_INLINE double
+sin_accurate(double x, uint16_t x_e, unsigned k,
+             const range_reduction_double_internal::LargeRangeReduction
+                 &range_reduction_large) {
+  using namespace math::range_reduction_double_internal;
+  using FPBits = typename fputil::FPBits<double>;
+
+  DFloat128 u_f128, sin_u, cos_u;
+  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
+    u_f128 = range_reduction_small_f128(x);
+  else
+    u_f128 = range_reduction_large.accurate();
+
+  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
+
+  auto get_sin_k = [](unsigned kk) -> DFloat128 {
+    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
+    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
+    if (kk & 128)
+      ans.sign = Sign::NEG;
+    return ans;
+  };
+
+  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
+  DFloat128 sin_k_f128 = get_sin_k(k);
+  DFloat128 cos_k_f128 = get_sin_k(k + 64);
+
+  // sin(x) = sin(k * pi/128 + u)
+  //        = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128)
+  DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u),
+                                  fputil::quick_mul(cos_k_f128, sin_u));
+
+  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
+  // https://github.com/llvm/llvm-project/issues/96452.
+
+  return static_cast<double>(r);
+}
+
 LIBC_INLINE double sin(double x) {
   using namespace math::range_reduction_double_internal;
   using FPBits = typename fputil::FPBits<double>;
@@ -43,7 +81,7 @@ LIBC_INLINE double sin(double x) {
   // |x| < 2^16
   if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
     // |x| < 2^-7
-    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) {
+    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) {
       // |x| < 2^-26, |sin(x) - x| < ulp(x)/2.
       if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 26)) {
         // Signed zeros.
@@ -64,9 +102,40 @@ LIBC_INLINE double sin(double x) {
 #endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
       }
       // No range reduction needed.
-      k = 0;
-      y.lo = 0.0;
-      y.hi = x;
+
+      // Use degree-9 polynomial approximation:
+      //   sin(x) ~ x + a1 * x^3 + a2 * x^5 + a3 * x^7 + a4 * x^9
+      //          ~ x + x^3 * Q(x^2).
+      // > P = fpminimax(sin(x)/x, [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]);
+      // > dirtyinfnorm((sin(x) - x*P)/sin(x), [-2^-4, 2^-4]);
+      // 0x1.3c2e...p-69
+      // > P;
+      constexpr double COEFFS[] = {-0x1.5555555555555p-3, 0x1.111111110f491p-7,
+                                   -0x1.a01a00e16af3ep-13,
+                                   0x1.71c24233f1bafp-19};
+      double x_sq = x * x;
+      double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]);
+      double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]);
+      double x4 = x_sq * x_sq;
+      double x3 = x * x_sq;
+      double r_lo = fputil::multiply_add(x4, c1, c0) * x3;
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return x + r_lo;
+#else
+      // Overall errors <= 2 * ulp(x^3/6) + |x| * 2^-68.
+      double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-68);
+      double r_lo_u = fputil::multiply_add(x, err, r_lo);
+      double r_lo_l = fputil::multiply_add(-x, err, r_lo);
+      double r_upper = x + r_lo_u;
+      double r_lower = x + r_lo_l;
+
+      if (LIBC_LIKELY(r_upper == r_lower))
+        return r_upper;
+
+      k = range_reduction_small(x, y);
+      return sin_accurate(x, x_e, k, range_reduction_large);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
     } else {
       // Small range reduction.
       k = range_reduction_small(x, y);
@@ -124,7 +193,7 @@ LIBC_INLINE double sin(double x) {
   DoubleDouble sin_k_cos_y = fputil::quick_mult(cos_y, sin_k);
   DoubleDouble cos_k_sin_y = fputil::quick_mult(sin_y, cos_k);
 
-  DoubleDouble rr = fputil::exact_add<false>(sin_k_cos_y.hi, cos_k_sin_y.hi);
+  DoubleDouble rr = fputil::exact_add(sin_k_cos_y.hi, cos_k_sin_y.hi);
   rr.lo += sin_k_cos_y.lo + cos_k_sin_y.lo;
 
 #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
@@ -142,35 +211,7 @@ LIBC_INLINE double sin(double x) {
   if (LIBC_LIKELY(r_upper == r_lower))
     return r_upper;
 
-  DFloat128 u_f128, sin_u, cos_u;
-  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
-    u_f128 = range_reduction_small_f128(x);
-  else
-    u_f128 = range_reduction_large.accurate();
-
-  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
-
-  auto get_sin_k = [](unsigned kk) -> DFloat128 {
-    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
-    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
-    if (kk & 128)
-      ans.sign = Sign::NEG;
-    return ans;
-  };
-
-  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
-  DFloat128 sin_k_f128 = get_sin_k(k);
-  DFloat128 cos_k_f128 = get_sin_k(k + 64);
-
-  // sin(x) = sin(k * pi/128 + u)
-  //        = sin(u) * cos(k*pi/128) + cos(u) * sin(k*pi/128)
-  DFloat128 r = fputil::quick_add(fputil::quick_mul(sin_k_f128, cos_u),
-                                  fputil::quick_mul(cos_k_f128, sin_u));
-
-  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
-  // https://github.com/llvm/llvm-project/issues/96452.
-
-  return static_cast<double>(r);
+  return sin_accurate(x, x_e, k, range_reduction_large);
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 }
 

>From 6e4ebaa43a7fe01063bdd888d141a5542cd38763 Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Fri, 5 Jun 2026 05:29:37 +0000
Subject: [PATCH 2/3] Skip sin_accurate definition when
 LIBC_MATH_HAS_SKIP_ACCURATE_PASS is defined.

---
 libc/src/__support/math/sin.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libc/src/__support/math/sin.h b/libc/src/__support/math/sin.h
index 046b83404790e..5b7d9b71b401d 100644
--- a/libc/src/__support/math/sin.h
+++ b/libc/src/__support/math/sin.h
@@ -29,6 +29,7 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace math {
 
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 LIBC_INLINE double
 sin_accurate(double x, uint16_t x_e, unsigned k,
              const range_reduction_double_internal::LargeRangeReduction
@@ -66,6 +67,7 @@ sin_accurate(double x, uint16_t x_e, unsigned k,
 
   return static_cast<double>(r);
 }
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 
 LIBC_INLINE double sin(double x) {
   using namespace math::range_reduction_double_internal;

>From a0359b0a1300241a9fde3fc2ee9339bc4356b175 Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Fri, 5 Jun 2026 13:23:24 +0000
Subject: [PATCH 3/3] Improve cos in a similar way.

---
 libc/src/__support/math/cos.h | 112 +++++++++++++++++++++++-----------
 1 file changed, 77 insertions(+), 35 deletions(-)

diff --git a/libc/src/__support/math/cos.h b/libc/src/__support/math/cos.h
index f968a999aeb3c..ef589ef3a198d 100644
--- a/libc/src/__support/math/cos.h
+++ b/libc/src/__support/math/cos.h
@@ -30,6 +30,47 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace math {
 
+#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+LIBC_INLINE double
+cos_accurate(double x, uint16_t x_e, unsigned k,
+             const range_reduction_double_internal::LargeRangeReduction
+                 &range_reduction_large) {
+  using namespace math::range_reduction_double_internal;
+  using FPBits = typename fputil::FPBits<double>;
+
+  DFloat128 u_f128, sin_u, cos_u;
+  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
+    u_f128 = range_reduction_small_f128(x);
+  else
+    u_f128 = range_reduction_large.accurate();
+
+  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
+
+  auto get_sin_k = [](unsigned kk) -> DFloat128 {
+    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
+    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
+    if (kk & 128)
+      ans.sign = Sign::NEG;
+    return ans;
+  };
+
+  // -sin(k * pi/128) = sin((k + 128) * pi/128)
+  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
+  DFloat128 msin_k_f128 = get_sin_k(k + 128);
+  DFloat128 cos_k_f128 = get_sin_k(k + 64);
+
+  // cos(x) = cos((k * pi/128 + u)
+  //        = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128)
+  DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u),
+                                  fputil::quick_mul(msin_k_f128, sin_u));
+
+  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
+  // https://github.com/llvm/llvm-project/issues/96452.
+
+  return static_cast<double>(r);
+}
+#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+
 LIBC_INLINE double cos(double x) {
   using namespace range_reduction_double_internal;
   using FPBits = typename fputil::FPBits<double>;
@@ -43,8 +84,8 @@ LIBC_INLINE double cos(double x) {
 
   // |x| < 2^16.
   if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT)) {
-    // |x| < 2^-7
-    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 7)) {
+    // |x| < 2^-4
+    if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 4)) {
       // |x| < 2^-27
       if (LIBC_UNLIKELY(x_e < FPBits::EXP_BIAS - 27)) {
         // Signed zeros.
@@ -55,9 +96,39 @@ LIBC_INLINE double cos(double x) {
         return fputil::round_result_slightly_down(1.0);
       }
       // No range reduction needed.
-      k = 0;
-      y.lo = 0.0;
-      y.hi = x;
+
+      // Use degree-8 polynomial approximation:
+      //   sin(x) ~ 1 + a1 * x^2 + a2 * x^4 + a3 * x^6 + a4 * x^8
+      //          ~ 1 + x^2 * Q(x^2).
+      // > P = fpminimax(cos(x), [|0, 2, 4, 6, 8|], [|1, D...|], [0, 2^-4]);
+      // > dirtyinfnorm(cos(x) - P, [-2^-4, 2^-4]);
+      // 0x1.3cfe...p-70
+      // > P;
+      constexpr double COEFFS[] = {-0x1p-1, 0x1.5555555555262p-5,
+                                   -0x1.6c16c1508bff1p-10,
+                                   0x1.a00ffd769159ap-16};
+      double x_sq = x * x;
+      double c0 = fputil::multiply_add(x_sq, COEFFS[1], COEFFS[0]);
+      double c1 = fputil::multiply_add(x_sq, COEFFS[3], COEFFS[2]);
+      double x4 = x_sq * x_sq;
+      double r_lo = fputil::multiply_add(x4, c1, c0) * x_sq;
+
+#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
+      return x + r_lo;
+#else
+      // Overall errors <= 2 * ulp(x^2/6) + |x| * 2^-69.
+      double err = fputil::multiply_add(x_sq, 0x1.0p-53, 0x1.0p-69);
+      double r_lo_u = r_lo + err;
+      double r_lo_l = r_lo - err;
+      double r_upper = x + r_lo_u;
+      double r_lower = x + r_lo_l;
+
+      if (LIBC_LIKELY(r_upper == r_lower))
+        return r_upper;
+
+      k = range_reduction_small(x, y);
+      return cos_accurate(x, x_e, k, range_reduction_large);
+#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS
     } else {
       // Small range reduction.
       k = range_reduction_small(x, y);
@@ -131,36 +202,7 @@ LIBC_INLINE double cos(double x) {
   if (LIBC_LIKELY(r_upper == r_lower))
     return r_upper;
 
-  DFloat128 u_f128, sin_u, cos_u;
-  if (LIBC_LIKELY(x_e < FPBits::EXP_BIAS + FAST_PASS_EXPONENT))
-    u_f128 = range_reduction_small_f128(x);
-  else
-    u_f128 = range_reduction_large.accurate();
-
-  math::sincos_eval_internal::sincos_eval(u_f128, sin_u, cos_u);
-
-  auto get_sin_k = [](unsigned kk) -> DFloat128 {
-    unsigned idx = (kk & 64) ? 64 - (kk & 63) : (kk & 63);
-    DFloat128 ans = SIN_K_PI_OVER_128_F128[idx];
-    if (kk & 128)
-      ans.sign = Sign::NEG;
-    return ans;
-  };
-
-  // -sin(k * pi/128) = sin((k + 128) * pi/128)
-  // cos(k * pi/128) = sin(k * pi/128 + pi/2) = sin((k + 64) * pi/128).
-  DFloat128 msin_k_f128 = get_sin_k(k + 128);
-  DFloat128 cos_k_f128 = get_sin_k(k + 64);
-
-  // cos(x) = cos((k * pi/128 + u)
-  //        = cos(u) * cos(k*pi/128) - sin(u) * sin(k*pi/128)
-  DFloat128 r = fputil::quick_add(fputil::quick_mul(cos_k_f128, cos_u),
-                                  fputil::quick_mul(msin_k_f128, sin_u));
-
-  // TODO: Add assertion if Ziv's accuracy tests fail in debug mode.
-  // https://github.com/llvm/llvm-project/issues/96452.
-
-  return static_cast<double>(r);
+  return cos_accurate(x, x_e, k, range_reduction_large);
 #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS
 }