[libc-commits] [libc] [libc][math] Improve performance of double precision trig functions. (PR #111793)

Thu Oct 10 08:52:09 PDT 2024

================
@@ -17,150 +17,269 @@
 #include "src/__support/common.h"
 #include "src/__support/integer_literals.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
 
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-#include "range_reduction_double_fma.h"
-
-// With FMA, we limit the maxmimum exponent to be 2^16, so that the error bound
-// from the fma::range_reduction_small is bounded by 2^-88 instead of 2^-72.
-#define FAST_PASS_EXPONENT 16
-using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::fma::range_reduction_small;
-using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128;
+namespace LIBC_NAMESPACE_DECL {
 
-LIBC_INLINE constexpr bool NO_FMA = false;
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr bool NO_FMA = false;
 #else
-#include "range_reduction_double_nofma.h"
+static constexpr bool NO_FMA = true;
+#endif // LIBC_TARGET_CPU_HAS_FMA
 
-using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT;
-using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::nofma::range_reduction_small;
-using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128;
+using LIBC_NAMESPACE::fputil::DoubleDouble;
+using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
 
-LIBC_INLINE constexpr bool NO_FMA = true;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#define FAST_PASS_EXPONENT 16
 
-namespace LIBC_NAMESPACE_DECL {
+// For 2^-7 < |x| < 2^16, return k and u such that:
+//   k = round(x * 128/pi)
+//   x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo
+// Error bound:
+//   |(x - k * pi/128) - (u_hi + u_lo)| <= max(ulp(ulp(u_hi)), 2^-119)
+//                                      <= 2^-111.
+LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) {
+  // Values of -pi/128 used for inputs with absolute value <= 2^16.
+  // The first 3 parts are generated with (53 - 21 = 32)-bit precision, so that
+  // the product k * MPI_OVER_128[i] is exact.
+  // Generated by Sollya with:
+  // > display = hexadecimal!;
+  // > a = round(pi/128, 32, RN);
+  // > b = round(pi/128 - a, 32, RN);
+  // > c = round(pi/128 - a - b, D, RN);
+  // > print(-a, ",", -b, ",", -c);
+  constexpr double MPI_OVER_128[3] = {-0x1.921fb544p-6, -0x1.0b4611a6p-40,
+                                      -0x1.3198a2e037073p-75};
+  constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5;
+  double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D;
+  double kd = fputil::nearest_integer(prod_hi);
 
-namespace generic {
+  // Let y = x - k * (pi/128)
+  // Then |y| < pi / 256
+  // With extra rounding errors, we can bound |y| < 1.6 * 2^-7.
+  double y_hi = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact
+  // |u.hi| < 1.6*2^-7
+  u.hi = fputil::multiply_add(kd, MPI_OVER_128[1], y_hi);
+  double u0 = y_hi - u.hi; // Exact;
+  // |u.lo| <= max(ulp(u.hi), |kd * MPI_OVER_128[2]|)
+  double u1 = fputil::multiply_add(kd, MPI_OVER_128[1], u0); // Exact
----------------
nickdesaulniers wrote:

The computation of `u.hi` and `u1` both multiply `kd` and `MPI_OVER_128[1]`. Does that end up getting calculated twice?

https://github.com/llvm/llvm-project/pull/111793