[libc-commits] [libc] [libc][math] Implement an integer-only version of double precision sin and cos with 1 ULP errors. (PR #184752)
Simon Tatham via libc-commits
libc-commits at lists.llvm.org
Fri Mar 6 06:23:43 PST 2026
================
@@ -0,0 +1,274 @@
+//===-- Trig range reduction and evaluation using integer-only --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_SINCOS_INTEGER_UTILS_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_SINCOS_INTEGER_UTILS_H
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/big_int.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/math_extras.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+namespace integer_only {
+
+// A 128-bit unsigned fixed-point fraction in [0, 1): the underlying integer
+// holds the real value scaled by 2^128.  All arithmetic wraps, i.e. is
+// performed mod 1.
+struct Frac128 : public UInt<128> {
+  using UInt<128>::UInt;
+
+  // Bitwise complement.  In two's complement -x = ~x + lsb, so ~x is a fast
+  // approximation of (1 - x) mod 1, short by exactly one ulp (2^-128).
+  constexpr Frac128 operator~() const {
+    Frac128 r;
+    r.val[0] = ~val[0];
+    r.val[1] = ~val[1];
+    return r;
+  }
+
+  // Wrapping addition mod 1.
+  constexpr Frac128 operator+(const Frac128 &other) const {
+    UInt<128> r = UInt<128>(*this) + (UInt<128>(other));
+    return Frac128(r.val);
+  }
+
+  // Wrapping subtraction mod 1.
+  constexpr Frac128 operator-(const Frac128 &other) const {
+    UInt<128> r = UInt<128>(*this) - (UInt<128>(other));
+    return Frac128(r.val);
+  }
+
+  // Fixed-point multiplication: the qualified call invokes
+  // UInt<128>::quick_mul_hi on *this, keeping only the high 128 bits of the
+  // 256-bit product.  NOTE(review): "quick" presumably means carries from
+  // the discarded low half may be dropped — confirm the error bound
+  // documented in big_int.h.
+  constexpr Frac128 operator*(const Frac128 &other) const {
+    UInt<128> r = UInt<128>::quick_mul_hi(UInt<128>(other));
+    return Frac128(r.val);
+  }
+};
+
+// 1280 bits of 2/pi, printed using MPFR, followed by 7 zero padding bytes
+// (so the table holds 1280 + 56 bits in total; see below).
+// Notice that if we store from the highest bytes to lowest bytes, it is
+// essentially having 2/pi in big-endian. On the other hand, uint64_t type
+// that will be used for computations later are in little-endian. So a few
+// bit-reversal instructions will be introduced when we extract the parts
+// out. It's possible to skip the bit-reversal instructions entirely by
+// having this table presented in little-endian, meaning from the lowest
+// bytes to highest bytes. The tradeoff will be a bit more complicated and
+// awkward in the computations of the index, but it might be worth it?
+// We also add 7 more 0x00 bytes (= ceil(54 / 8), matching the index shift
+// derived in trig_range_reduction below) to extend to all non-negative
+// exponents.
+LIBC_INLINE_VAR constexpr unsigned TWO_OVER_PI_LENGTH = 1280 / 8 + 7;
+
+LIBC_INLINE_VAR constexpr uint8_t TWO_OVER_PI[TWO_OVER_PI_LENGTH] = {
+    0x1D, 0x36, 0xF4, 0x9A, 0x20, 0xBC, 0xCF, 0xF0, 0xAB, 0x6B, 0x7B, 0xFC,
+    0x46, 0x30, 0x03, 0x56, 0x08, 0x5D, 0x8D, 0x1F, 0xB1, 0x5F, 0xFB, 0x6B,
+    0xEA, 0x92, 0x52, 0x8A, 0xF7, 0x39, 0x07, 0x3D, 0x7B, 0xF1, 0xE5, 0xEB,
+    0xC7, 0xBA, 0x27, 0x75, 0x2D, 0xEA, 0x5F, 0x9E, 0x66, 0x3F, 0x46, 0x4F,
+    0xB7, 0x09, 0xCB, 0x27, 0xCF, 0x7E, 0x36, 0x6D, 0x1F, 0x6D, 0x0A, 0x5A,
+    0x8B, 0x11, 0x2F, 0xEF, 0x0F, 0x98, 0x05, 0xDE, 0xFF, 0x97, 0xF8, 0x1F,
+    0x3B, 0x28, 0xF9, 0xBD, 0x8B, 0x5F, 0x84, 0x9C, 0xF4, 0x39, 0x53, 0x83,
+    0x39, 0xD6, 0x91, 0x39, 0x41, 0x7E, 0x5F, 0xB4, 0x26, 0x70, 0x9C, 0xE9,
+    0x84, 0x44, 0xBB, 0x2E, 0xF5, 0x35, 0x82, 0xE8, 0x3E, 0xA7, 0x29, 0xB1,
+    0x1C, 0xEB, 0x1D, 0xFE, 0x1C, 0x92, 0xD1, 0x09, 0xEA, 0x2E, 0x49, 0x06,
+    0xE0, 0xD2, 0x4D, 0x42, 0x3A, 0x6E, 0x24, 0xB7, 0x61, 0xC5, 0xBB, 0xDE,
+    0xAB, 0x63, 0x51, 0xFE, 0x41, 0x90, 0x43, 0x3C, 0x99, 0x95, 0x62, 0xDB,
+    0xC0, 0xDD, 0x34, 0xF5, 0xD1, 0x57, 0x27, 0xFC, 0x29, 0x15, 0x44, 0x4E,
+    0x6E, 0x83, 0xF9, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+// pi/2 - 1, i.e. the fractional part of pi/2, as a 0.128 fixed-point
+// fraction.  trig_range_reduction multiplies by pi/2 via the identity
+// x * pi/2 = x * (pi/2 - 1) + x, keeping every operand below 1.
+LIBC_INLINE_VAR constexpr Frac128 PI_OVER_2_M1({0x898c'c517'01b8'39a2,
+                                                0x921f'b544'42d1'8469});
+
+// Perform range reduction mod pi/2
+//
+// Inputs:
+// x_u: explicit mantissa
+// x_e: biased exponent
+// Output:
+// k : round(x * 2/pi) mod 4
+// x_frac: |x - k * pi/2| as a Frac128 (fits in [0, 1) since it is at most
+//         pi/4 after rounding to the nearest k)
+// Return:
+// x_frac_is_neg.
+//
+// NOTE(review): e_num below is computed in unsigned arithmetic, so this
+// assumes x_e + 2 >= EXP_BIAS (otherwise it wraps) — confirm callers handle
+// smaller |x| on a separate path.
+LIBC_INLINE constexpr bool trig_range_reduction(uint64_t x_u, unsigned x_e,
+                                                unsigned &k, Frac128 &x_frac) {
+  using FPBits = typename fputil::FPBits<double>;
+  bool x_frac_is_neg = false;
+  // We do multiplication x * (2/pi)
+  // Let T[i] be the i'th byte of 2/pi expansion:
+  //   Then 2/pi = T[0] * 2^-8 + T[1] * 2^-16 + ...
+  //             = sum_i T[i] * 2^(-8(i + 1))
+  // To be able to drop all T[j] * 2^(-8(j + 1)) for small j < i, we will want
+  //   ulp(x) * lsb(T[i - 1] * 2^(-8 * i)) >= 4 = 2^2 (since 4 * pi/2 = 2*pi)
+  // So:
+  //   2^(e - 52) * 2^(-8 * i) >= 2^2
+  // Or equivalently,
+  //   e - 54 - 8*i >= 0.
+  // Define:
+  //   i = floor( (e - 54)/8 ),
+  // and let
+  //   s = e - 54 - 8i >= 0.
+  // Since we store the mantissa of x, which is 53 bits long in a 64 bit
+  // integer, we have some wiggle room to shuffle the lsb of x.
+  // By shifting mantissa of x_u to the left by s, the lsb of x_u will be:
+  //   2^(e - 52 - s), for which, the product of lsb's is now exactly 4
+  //   lsb(x_u) * 2^(-8 * i)) = 4.
+  // This will allow us to compute the full product:
+  //   x_u * (T[i] * 2^(-8(i + 1)) + ... ) in exact fixed point.
+  // From the formula of i, in order for i >= 0, e >= 54. To support all the
+  // exponents e >= 0, we could add ceil(54 / 8) = 7 0x00 bytes and shift the
+  // index by 7.
+  unsigned e_num = x_e - FPBits::EXP_BIAS + 2; // e - 54 + 7*8
+  // With
+  //   i = floor( (e - 54) / 8 ),
+  // the shifted-by-7 index is:
+  //   j = i + 7 = floor( (e - 54) / 8 ) + 7
+  // Since the 64-bit integer chunk will be formed by T[j] ... T[j + 7],
+  // and we store the table in the little-endian form, we will index to the
+  // lowest part of the 64-bit integer chunk, which is:
+  //   idx = the index of the T[j + 7] part.
+  unsigned j = e_num >> 3;
+  unsigned idx = (TWO_OVER_PI_LENGTH - 1 - j - 7);
+  unsigned shift = e_num & 7; // s = e - 54 - 8*i
+  x_u <<= shift;              // lsb(x_u) = 2^(e - 52 - s)
+  UInt<64> x_u64(x_u);
+  // Gather parts
+  // Assemble 8 consecutive table bytes into a uint64_t, little-endian.
+  auto get_uint64 = [](const uint8_t *ptr) -> uint64_t {
+    return ptr[0] | (uint64_t(ptr[1]) << 8) | (uint64_t(ptr[2]) << 16) |
+           (uint64_t(ptr[3]) << 24) | (uint64_t(ptr[4]) << 32) |
+           (uint64_t(ptr[5]) << 40) | (uint64_t(ptr[6]) << 48) |
+           (uint64_t(ptr[7]) << 56);
+  };
+  // lsb(c0) = 2^(-8i - 64)
+  uint64_t c0 = get_uint64(&TWO_OVER_PI[idx]);
+  // lsb(p0) = lsb(x_u) * lsb(c0)
+  //         = 2^(e - 52 - s) * 2^(-8i - 64)
+  //         = 2^(-62)
+  // msb(p0) = 2^(-62 + 63) = 2^1.
+  uint64_t p0 = x_u * c0;
+  // lsb(c1) = lsb(c0) * 2^-64 = 2^(-8i - 128)
+  UInt<64> c1(get_uint64(&TWO_OVER_PI[idx - 8]));
+  // lsb(c2) = lsb(c1) * 2^-64 = 2^(-8i - 192)
+  UInt<64> c2(get_uint64(&TWO_OVER_PI[idx - 16]));
+  // lsb(p1) = lsb(x_u) * lsb(c1) = 2^(-62 - 64) = 2^-126
+  UInt<128> p1 = x_u64.ful_mul(c1);
+  // lsb(p2) = lsb(x_u) * lsb(c2) * 2^64 = 2^-126
+  UInt<128> p2(x_u64.quick_mul_hi(c2));
+  UInt<128> sum = p1 + p2;
+  // lsb(p0) = 2^-62 = lsb(sum.val[1]), so p0 folds directly into the high
+  // word; any overflow out of bit 63 has weight >= 4 and is irrelevant,
+  // since we only need (x * 2/pi) mod 4.
+  sum.val[1] += p0;
+  // Get the highest 2 bits.
+  // sum's msb has weight 2^1, so bits 63:62 of val[1] are floor(x*2/pi) mod 4.
+  k = static_cast<unsigned>(sum.val[1] >> 62);
+  // Bit 61 carries weight 2^-1: the first fractional bit of x * 2/pi.
+  bool round_bit = sum.val[1] & 0x2000'0000'0000'0000;
+  // Shift so that the leading bit is 0.5.
+  x_frac.val[0] = (sum.val[0] << 2);
+  x_frac.val[1] = (sum.val[1] << 2) | (sum.val[0] >> 62);
+
+  // Round to nearest k.
+  if (round_bit) {
+    // Flip the sign.
+    x_frac_is_neg = true;
+    ++k;
+    // Fast approximation of `1 - x_frac` with error = -lsb(x_frac) = -2^-128.
+    // Since in 2-complement, -x = ~x + lsb(x).
+    x_frac = ~x_frac;
+  }
+
+  // Perform multiplication x_frac * pi/2
+  // Using x_frac * (pi/2 - 1) + x_frac == x_frac * pi/2, so that every
+  // operand stays below 1 and fits in Frac128.
+  x_frac = fputil::multiply_add(x_frac, PI_OVER_2_M1, x_frac);
+
+  return x_frac_is_neg;
+}
+
+// 128-bit fixed-point minimax polynomial approximation of sin(x) generated by
+// Sollya with:
+// > P = fpminimax(sin(x), [|1, 3, 5, 7, 9, 11, 13|], [|1, 128...|],
+//                 [0, pi/4], fixed);
+// > dirtyinfnorm( (sin(x) - P(x))/sin(x), [0, pi/4]);
+// 0x1.17a4...p-58
+// Coefficients are stored as magnitudes scaled by 2^128 (Frac128 cannot hold
+// a sign); the positive- and negative-coefficient groups are presumably
+// evaluated as separate sub-polynomials and subtracted in sin_eval below.
+LIBC_INLINE_VAR constexpr Frac128 SIN_COEFF[] = {
+    // Positive coefficients
+    Frac128({0x321f'bc0b'b8ca'f059, 0x0222'2222'221e'eac3}), // x^5
+    Frac128({0x0556'929e'ad60'7cb2, 0x0000'2e3b'c6ab'd75e}), // x^9
+    Frac128({0x4c97'758e'92ac'214c, 0x0000'0000'aec7'1a39}), // x^13
+    // Negative coefficients
+    Frac128({0x91b3'96a3'd5c5'fd6a, 0x2aaa'aaaa'aaaa'8ff2}), // x^3
+    Frac128({0x36aa'355c'3311'996d, 0x000d'00d0'0cdf'8c9b}), // x^7
+    Frac128({0xa260'c74f'239d'd891, 0x0000'006b'9795'15a2}), // x^11
+};
+// 128-bit fixed-point minimax polynomial approximation of cos(x) generated by
+// Sollya with:
+// > P = fpminimax(cos(x), [|0, 2, 4, 6, 8, 10, 12|], [|1, 128...|],
+//                 [0, pi/4], fixed);
+// > dirtyinfnorm( (cos(x) - P(x))/cos(x), [0, pi/4]);
+// 0x1.269f...p-54
+// Same storage convention as SIN_COEFF: magnitudes only, grouped by sign.
+LIBC_INLINE_VAR constexpr Frac128 COS_COEFF[] = {
+    // Positive coefficients
+    Frac128({0x860a'3e6c'cc50'e0d8, 0x0aaa'aaaa'aa77'5c33}), // x^4
+    Frac128({0x84b2'76a3'c971'e7b8, 0x0001'a019'f80a'8ad5}), // x^8
+    Frac128({0xed56'891e'f750'c7a9, 0x0000'0008'dc50'133d}), // x^12
+    // Negative coefficients
+    Frac128({0x56f6'2e74'b16e'5555, 0x7fff'ffff'fffe'4bfe}), // x^2
+    Frac128({0xa87a'8f81'7440'7dd6, 0x005b'05b0'58fc'6fed}), // x^6
+    Frac128({0x0082'310d'4e65'6b1f, 0x0000'049f'7cff'73d2}), // x^10
+};
+
+// Compute sin(x) with relative errors ~ 2^-54.
+LIBC_INLINE constexpr double sin_eval(const Frac128 &x_frac, unsigned k,
+ bool is_neg, bool x_frac_is_neg) {
+ // cos when k = 1, 3
+ bool is_cos = ((k & 1) == 1);
+ // flip sign when k = 2, 3
+ is_neg = is_neg != ((k & 2) == 2);
+
+ const Frac128 *coeffs = is_cos ? COS_COEFF : SIN_COEFF;
+
+ Frac128 xsq = x_frac * x_frac;
+ Frac128 x4 = xsq * xsq;
+ Frac128 poly_pos = fputil::polyeval(x4, coeffs[0], coeffs[1], coeffs[2]);
----------------
statham-arm wrote:
_Why_ are we evaluating the positive and negative terms separately? There's no comment explaining it. Wouldn't it be simpler to just diverge once based on `is_cos`, evaluate a single polynomial in x², and multiply the result by x in the sin case?
I can't see that it makes much difference to the accuracy either way – there's no reason to expect the rounding errors in the multiplications to cancel each other out more often in this strategy than the more obvious one. (And the other concerns you'd have in FP, such as well-conditioning and significance loss, don't occur anyway with this strategy.)
Perhaps it's to avoid overflow of `UInt<128>`? But why avoid that? It should work fine – you compute the answer mod 1, and since you know on each branch that it's bounded within an interval with width < 1, that can't give you the wrong answer.
https://github.com/llvm/llvm-project/pull/184752
More information about the libc-commits
mailing list