[libc-commits] [libc] [libc][math] Improve performance of double precision trig functions. (PR #111793)
Nick Desaulniers via libc-commits
libc-commits at lists.llvm.org
Thu Oct 10 08:52:09 PDT 2024
================
@@ -17,150 +17,269 @@
#include "src/__support/common.h"
#include "src/__support/integer_literals.h"
#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
-#include "range_reduction_double_fma.h"
-
-// With FMA, we limit the maxmimum exponent to be 2^16, so that the error bound
-// from the fma::range_reduction_small is bounded by 2^-88 instead of 2^-72.
-#define FAST_PASS_EXPONENT 16
-using LIBC_NAMESPACE::fma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::fma::range_reduction_small;
-using LIBC_NAMESPACE::fma::SIN_K_PI_OVER_128;
+namespace LIBC_NAMESPACE_DECL {
-LIBC_INLINE constexpr bool NO_FMA = false;
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr bool NO_FMA = false;
#else
-#include "range_reduction_double_nofma.h"
+static constexpr bool NO_FMA = true;
+#endif // LIBC_TARGET_CPU_HAS_FMA
-using LIBC_NAMESPACE::nofma::FAST_PASS_EXPONENT;
-using LIBC_NAMESPACE::nofma::ONE_TWENTY_EIGHT_OVER_PI;
-using LIBC_NAMESPACE::nofma::range_reduction_small;
-using LIBC_NAMESPACE::nofma::SIN_K_PI_OVER_128;
+using LIBC_NAMESPACE::fputil::DoubleDouble;
+using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
-LIBC_INLINE constexpr bool NO_FMA = true;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#define FAST_PASS_EXPONENT 16
-namespace LIBC_NAMESPACE_DECL {
+// For 2^-7 < |x| < 2^16, return k and u such that:
+// k = round(x * 128/pi)
+// x mod pi/128 = x - k * pi/128 ~ u.hi + u.lo
+// Error bound:
+// |(x - k * pi/128) - (u_hi + u_lo)| <= max(ulp(ulp(u_hi)), 2^-119)
+// <= 2^-111.
+LIBC_INLINE unsigned range_reduction_small(double x, DoubleDouble &u) {
+ // Values of -pi/128 used for inputs with absolute value <= 2^16.
+ // The first 3 parts are generated with (53 - 21 = 32)-bit precision, so that
+ // the product k * MPI_OVER_128[i] is exact.
+ // Generated by Sollya with:
+ // > display = hexadecimal!;
+ // > a = round(pi/128, 32, RN);
+ // > b = round(pi/128 - a, 32, RN);
+ // > c = round(pi/128 - a - b, D, RN);
+ // > print(-a, ",", -b, ",", -c);
+ constexpr double MPI_OVER_128[3] = {-0x1.921fb544p-6, -0x1.0b4611a6p-40,
+ -0x1.3198a2e037073p-75};
+ constexpr double ONE_TWENTY_EIGHT_OVER_PI_D = 0x1.45f306dc9c883p5;
+ double prod_hi = x * ONE_TWENTY_EIGHT_OVER_PI_D;
+ double kd = fputil::nearest_integer(prod_hi);
-namespace generic {
+ // Let y = x - k * (pi/128)
+ // Then |y| < pi / 256
+ // With extra rounding errors, we can bound |y| < 1.6 * 2^-7.
+ double y_hi = fputil::multiply_add(kd, MPI_OVER_128[0], x); // Exact
+ // |u.hi| < 1.6*2^-7
+ u.hi = fputil::multiply_add(kd, MPI_OVER_128[1], y_hi);
+ double u0 = y_hi - u.hi; // Exact;
+ // |u.lo| <= max(ulp(u.hi), |kd * MPI_OVER_128[2]|)
+ double u1 = fputil::multiply_add(kd, MPI_OVER_128[1], u0); // Exact
----------------
nickdesaulniers wrote:
The computation of `u.hi` and `u1` both multiply `kd` and `MPI_OVER_128[1]`. Does that end up getting calculated twice?
https://github.com/llvm/llvm-project/pull/111793
More information about the libc-commits
mailing list