[libc-commits] [libc] [llvm] [libc][math][c23] Improve rsqrtf16() function (PR #160639)
Anton Shepelev via libc-commits
libc-commits at lists.llvm.org
Sun Apr 26 19:39:21 PDT 2026
https://github.com/amemov updated https://github.com/llvm/llvm-project/pull/160639
>From eb3025d7bbe0d07af0efd7d4b64e848e878baa5e Mon Sep 17 00:00:00 2001
From: Anton Shepelev <shepelev777 at gmail.com>
Date: Wed, 24 Sep 2025 20:59:06 -0700
Subject: [PATCH 1/5] - Draft of math approximation for targets that have only
int hardware - Fixed typo in +inf case. Should return +0 according to
F.10.4.9
---
libc/src/__support/math/rsqrtf16.h | 109 +++++++++++++++++++++++++++--
1 file changed, 105 insertions(+), 4 deletions(-)
diff --git a/libc/src/__support/math/rsqrtf16.h b/libc/src/__support/math/rsqrtf16.h
index ab7529682950d..551aa9710ea70 100644
--- a/libc/src/__support/math/rsqrtf16.h
+++ b/libc/src/__support/math/rsqrtf16.h
@@ -58,12 +58,11 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
return FPBits::quiet_nan().get_val();
}
- // x = +inf => rsqrt(x) = 0
- return FPBits::zero().get_val();
+ // x = +inf => rsqrt(x) = +0
+ return FPBits::zero(xbits.sign()).get_val();
}
- // TODO: add integer based implementation when LIBC_TARGET_CPU_HAS_FPU_FLOAT
- // is not defined
+#ifdef LIBC_TARGET_CPU_HAS_FPU_FLOAT
float result = 1.0f / fputil::sqrt<float>(fputil::cast<float>(x));
// Targeted post-corrections to ensure correct rounding in half for specific
@@ -76,6 +75,108 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
}
return fputil::cast<float16>(result);
+
+#else
+ // Range reduction:
+ // x can be expressed as m*2^e, where e - int exponent and m - mantissa
+ // rsqrtf16(x) = rsqrtf16(m*2^e)
+ // rsqrtf16(m*2^e) = 1/sqrt(m) * 1/sqrt(2^e) = 1/sqrt(m) * 1/2^(e/2)
+ // 1/sqrt(m) * 1/2^(e/2) = 1/sqrt(m) * 2^(-e/2)
+
+ // Compute reduction directly from half bits to avoid frexp/ldexp overhead.
+ int exponent = 0;
+ int signifcand = 0; // same as mantissa, but int
+ uint16_t eh = static_cast<uint16_t>((x_abs >> 10) & 0x1F);
+ uint16_t frac = static_cast<uint16_t>(x_abs & 0x3FF);
+
+ int result;
+ if (eh != 0) {
+ // ((2^-1 + frac/2^11) * 2) * 2^(eh-15)
+
+ // Normal: x = (1 + frac/2^10) * 2^(eh-15) = ((0.5 + frac/2^11) * 2) *
+ // 2^(eh-15)
+ // => mantissa in [0.5,1): m = 0.5 + frac/2^11, exponent = (eh - 15) + 1 =
+ // eh - 14
+ exponent = static_cast<int>(eh) - 14;
+ mantissa = 0.5f + static_cast<float>(frac) * 0x1.0p-11f;
+ } else {
+ // Subnormal: x = (frac/2^10) * 2^(1-15) = frac * 2^-24.
+ // Normalize frac so that bit 9 becomes 1; then mantissa m = (frac <<
+ // t)/2^10 ∈ [0.5,1) and exponent E = -14 - t so that x = m * 2^E.
+ if (LIBC_UNLIKELY(frac == 0)) {
+ // Should have been handled by zero check above, but keep safe.
+ return FPBits::inf(Sign::POS).get_val();
+ }
+ int shifts = 0;
+ while ((frac & 0x200u) == 0u) { // bring into [0x200, 0x3FF]
+ frac <<= 1;
+ ++shifts;
+ }
+ exponent = -14 - shifts;
+ mantissa = static_cast<float>(frac) * 0x1.0p-10f;
+ }
+
+ float result = 0.0f;
+ int exp_floored = -(exponent >> 1);
+
+ if (mantissa == 0.5f) {
+ // When mantissa is 0.5f, x was a power of 2 (or subnormal that normalizes
+ // this way). 1/sqrt(0.5f) = sqrt(2.0f).
+ // If exponent is odd (exponent = 2k + 1):
+ // rsqrt(x) = (1/sqrt(0.5)) * 2^(-(2k+1)/2) = sqrt(2) * 2^(-k-0.5)
+ // = sqrt(2) * 2^(-k) * (1/sqrt(2)) = 2^(-k)
+ // exp_floored = -((2k+1)>>1) = -(k) = -k
+ // So result = ldexp(1.0f, exp_floored)
+ // If exponent is even (exponent = 2k):
+ // rsqrt(x) = (1/sqrt(0.5)) * 2^(-2k/2) = sqrt(2) * 2^(-k)
+ // exp_floored = -((2k)>>1) = -(k) = -k
+ // So result = ldexp(sqrt(2.0f), exp_floored)
+ if (exponent & 1) {
+ result = fputil::ldexp(1.0f, exp_floored);
+ } else {
+ constexpr float SQRT_2_F = 0x1.6a09e6p0f; // sqrt(2.0f)
+ result = fputil::ldexp(SQRT_2_F, exp_floored);
+ }
+ } else {
+ // 4 Degree minimax polynomial (single-precision coefficients) generated
+ // with Sollya: P = fpminimax(1/sqrt(x), 4,
+ // [|single,single,single,single,single|], [0.5;1])
+ float y = fputil::polyeval(mantissa,
+ 0x1.771256p1f, // c0
+ -0x1.5e7c4ap2f, // c1
+ 0x1.b3851cp2f, // c2
+ -0x1.1a27ep2f, // c3
+ 0x1.265c66p0f); // c4
+
+ // Newton-Raphson iteration in float (use multiply_add to leverage FMA when
+ // available):
+ float y2 = y * y;
+ float factor = fputil::multiply_add(-0.5f * mantissa, y2, 1.5f);
+ y = y * factor;
+
+ result = fputil::ldexp(y, exp_floored);
+ if (exponent & 1) {
+ constexpr float ONE_OVER_SQRT2 = 0x1.6a09e6p-1f; // 1/sqrt(2)
+ result *= ONE_OVER_SQRT2;
+ }
+
+ // Targeted post-correction: for the specific half-precision mantissa
+ // pattern M == 0x011F we observe a consistent -1 ULP bias across exponents.
+ // Apply a tiny upward nudge to cross the rounding boundary in all modes.
+ const uint16_t half_mantissa = static_cast<uint16_t>(x_abs & 0x3ff);
+ if (half_mantissa == 0x011F) {
+ // Nudge up to fix consistent -1 ULP at that mantissa boundary
+ result = fputil::multiply_add(result, 0x1.0p-21f,
+ result); // result *= (1 + 2^-21)
+ } else if (half_mantissa == 0x0313) {
+ // Nudge down to fix +1 ULP under upward rounding at this mantissa
+ // boundary
+ result = fputil::multiply_add(result, -0x1.0p-21f,
+ result); // result *= (1 - 2^-21)
+ }
+ }
+ return fputil::cast<float16>(result);
+#endif
}
} // namespace math
>From ed1e4d394de1e1b47ec2e12d34dbc6583a39f4d7 Mon Sep 17 00:00:00 2001
From: amemov <shepelev777 at gmail.com>
Date: Mon, 17 Nov 2025 22:00:28 -0800
Subject: [PATCH 2/5] Added approximation from previous PR
---
libc/src/__support/math/rsqrtf16.h | 45 +++++-------------------------
1 file changed, 7 insertions(+), 38 deletions(-)
diff --git a/libc/src/__support/math/rsqrtf16.h b/libc/src/__support/math/rsqrtf16.h
index 551aa9710ea70..4b237950b0b17 100644
--- a/libc/src/__support/math/rsqrtf16.h
+++ b/libc/src/__support/math/rsqrtf16.h
@@ -16,6 +16,7 @@
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/ManipulationFunctions.h"
+#include "src/__support/FPUtil/PolyEval.h"
#include "src/__support/FPUtil/cast.h"
#include "src/__support/FPUtil/multiply_add.h"
#include "src/__support/FPUtil/sqrt.h"
@@ -77,44 +78,10 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
return fputil::cast<float16>(result);
#else
- // Range reduction:
- // x can be expressed as m*2^e, where e - int exponent and m - mantissa
- // rsqrtf16(x) = rsqrtf16(m*2^e)
- // rsqrtf16(m*2^e) = 1/sqrt(m) * 1/sqrt(2^e) = 1/sqrt(m) * 1/2^(e/2)
- // 1/sqrt(m) * 1/2^(e/2) = 1/sqrt(m) * 2^(-e/2)
+ float xf = fputil::cast<float>(x);
- // Compute reduction directly from half bits to avoid frexp/ldexp overhead.
int exponent = 0;
- int signifcand = 0; // same as mantissa, but int
- uint16_t eh = static_cast<uint16_t>((x_abs >> 10) & 0x1F);
- uint16_t frac = static_cast<uint16_t>(x_abs & 0x3FF);
-
- int result;
- if (eh != 0) {
- // ((2^-1 + frac/2^11) * 2) * 2^(eh-15)
-
- // Normal: x = (1 + frac/2^10) * 2^(eh-15) = ((0.5 + frac/2^11) * 2) *
- // 2^(eh-15)
- // => mantissa in [0.5,1): m = 0.5 + frac/2^11, exponent = (eh - 15) + 1 =
- // eh - 14
- exponent = static_cast<int>(eh) - 14;
- mantissa = 0.5f + static_cast<float>(frac) * 0x1.0p-11f;
- } else {
- // Subnormal: x = (frac/2^10) * 2^(1-15) = frac * 2^-24.
- // Normalize frac so that bit 9 becomes 1; then mantissa m = (frac <<
- // t)/2^10 ∈ [0.5,1) and exponent E = -14 - t so that x = m * 2^E.
- if (LIBC_UNLIKELY(frac == 0)) {
- // Should have been handled by zero check above, but keep safe.
- return FPBits::inf(Sign::POS).get_val();
- }
- int shifts = 0;
- while ((frac & 0x200u) == 0u) { // bring into [0x200, 0x3FF]
- frac <<= 1;
- ++shifts;
- }
- exponent = -14 - shifts;
- mantissa = static_cast<float>(frac) * 0x1.0p-10f;
- }
+ float mantissa = fputil::frexp(xf, exponent);
float result = 0.0f;
int exp_floored = -(exponent >> 1);
@@ -139,8 +106,9 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
}
} else {
// 4 Degree minimax polynomial (single-precision coefficients) generated
- // with Sollya: P = fpminimax(1/sqrt(x), 4,
- // [|single,single,single,single,single|], [0.5;1])
+ // with Sollya:
+ // P = fpminimax(1/sqrt(x), 4,
+ // [|single,single,single,single,single|], [0.5;1])
float y = fputil::polyeval(mantissa,
0x1.771256p1f, // c0
-0x1.5e7c4ap2f, // c1
@@ -175,6 +143,7 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
result); // result *= (1 - 2^-21)
}
}
+
return fputil::cast<float16>(result);
#endif
}
>From a214bd1a221666f0d00c552922d9dd453905765f Mon Sep 17 00:00:00 2001
From: amemov <shepelev777 at gmail.com>
Date: Sat, 25 Apr 2026 13:37:01 -0700
Subject: [PATCH 3/5] new approximation draft
---
libc/src/__support/math/CMakeLists.txt | 14 ++
libc/src/__support/math/rsqrtf16.h | 237 +++++++++++++-----
.../llvm-project-overlay/libc/BUILD.bazel | 1 -
3 files changed, 182 insertions(+), 70 deletions(-)
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 095028c644d37..6f75baef8bf96 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -126,6 +126,20 @@ add_header_library(
libc.src.__support.macros.properties.types
)
+add_header_library(
+ rsqrtf16
+ HDRS
+ rsqrtf16.h
+ DEPENDS
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.sqrt
+ libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.types
+)
+
add_header_library(
asin_utils
HDRS
diff --git a/libc/src/__support/math/rsqrtf16.h b/libc/src/__support/math/rsqrtf16.h
index 4b237950b0b17..23c74a8515c15 100644
--- a/libc/src/__support/math/rsqrtf16.h
+++ b/libc/src/__support/math/rsqrtf16.h
@@ -15,8 +15,6 @@
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/ManipulationFunctions.h"
-#include "src/__support/FPUtil/PolyEval.h"
#include "src/__support/FPUtil/cast.h"
#include "src/__support/FPUtil/multiply_add.h"
#include "src/__support/FPUtil/sqrt.h"
@@ -25,6 +23,173 @@
namespace LIBC_NAMESPACE_DECL {
namespace math {
+namespace rsqrtf16_internal {
+
+LIBC_INLINE_VAR constexpr int RSQRT_FRACTION_BITS = 29;
+LIBC_INLINE_VAR constexpr int64_t ONE = int64_t(1) << RSQRT_FRACTION_BITS;
+LIBC_INLINE_VAR constexpr int64_t THREE_HALVES = 3 * (ONE >> 1);
+
+// Degree-4 minimax polynomial generated with Sollya:
+// P = fpminimax(1/sqrt(x), 4,
+// [|single,single,single,single,single|], [0.5;1])
+// Coefficients are stored in Q29 fixed-point format.
+LIBC_INLINE_VAR constexpr int64_t COEFFS[5] = {
+ 1'573'164'416, -2'940'085'504, 3'653'406'208, -2'366'894'080, 617'319'616,
+};
+LIBC_INLINE_VAR constexpr int64_t ONE_OVER_SQRT2 = 0x16a09e60;
+
+LIBC_INLINE constexpr int floor_log2(uint64_t x) {
+ int result = -1;
+ while (x) {
+ x >>= 1;
+ ++result;
+ }
+ return result;
+}
+
+LIBC_INLINE constexpr int64_t eval_polynomial(uint32_t m) {
+ int64_t y = COEFFS[4];
+ y = COEFFS[3] + ((y * m) >> RSQRT_FRACTION_BITS);
+ y = COEFFS[2] + ((y * m) >> RSQRT_FRACTION_BITS);
+ y = COEFFS[1] + ((y * m) >> RSQRT_FRACTION_BITS);
+ y = COEFFS[0] + ((y * m) >> RSQRT_FRACTION_BITS);
+ return y;
+}
+
+LIBC_INLINE constexpr int64_t newton_raphson(uint32_t m, int64_t y) {
+ int64_t y2 = (y * y) >> RSQRT_FRACTION_BITS;
+ int64_t my2 = (static_cast<int64_t>(m) * y2) >> RSQRT_FRACTION_BITS;
+ int64_t factor = THREE_HALVES - (my2 >> 1);
+ return (y * factor) >> RSQRT_FRACTION_BITS;
+}
+
+LIBC_INLINE constexpr uint16_t fixed_to_half_bits(uint64_t y, int scale_exp) {
+ int y_log2 = floor_log2(y);
+ int out_exp = scale_exp + y_log2 - RSQRT_FRACTION_BITS;
+ int biased_exp = out_exp + 15;
+
+ uint32_t out_sig = y_log2 >= 10 ? static_cast<uint32_t>(y >> (y_log2 - 10))
+ : static_cast<uint32_t>(y << (10 - y_log2));
+
+ if (biased_exp <= 0)
+ return 0x0400;
+ if (biased_exp >= 31)
+ return 0x7bff;
+
+ return static_cast<uint16_t>((biased_exp << 10) | (out_sig & 0x3ff));
+}
+
+LIBC_INLINE constexpr uint16_t approximate_rsqrt(uint16_t x_abs) {
+ uint32_t x_mant = x_abs & 0x03ff;
+ int exponent = 0;
+
+ if (x_abs >= 0x0400) {
+ x_mant |= 0x0400;
+ exponent = static_cast<int>(x_abs >> 10) - 14;
+ } else {
+ exponent = -13;
+ while ((x_mant & 0x0400) == 0) {
+ x_mant <<= 1;
+ --exponent;
+ }
+ }
+
+ uint32_t m = x_mant << (RSQRT_FRACTION_BITS - 11);
+ int64_t y = newton_raphson(m, eval_polynomial(m));
+
+ int scale_exp = 0;
+ if (exponent & 1) {
+ y = (y * ONE_OVER_SQRT2) >> RSQRT_FRACTION_BITS;
+ scale_exp = -((exponent - 1) / 2);
+ } else {
+ scale_exp = -(exponent / 2);
+ }
+
+ return fixed_to_half_bits(static_cast<uint64_t>(y), scale_exp);
+}
+
+// Compare y = sig * 2^exp with 1 / sqrt(x_sig * 2^x_exp).
+// Return -1 if y is below the exact value, 0 if exact, and 1 if above.
+LIBC_INLINE constexpr int compare_with_rsqrt(uint32_t sig, int exp,
+ uint32_t x_sig, int x_exp) {
+ uint64_t lhs = static_cast<uint64_t>(sig) * sig * x_sig;
+ int scale = 2 * exp + x_exp;
+
+ if (scale >= 0)
+ return (scale == 0 && lhs == 1) ? 0 : 1;
+
+ int rshift = -scale;
+ if (rshift >= 64)
+ return -1;
+
+ uint64_t rhs = uint64_t(1) << rshift;
+ if (lhs < rhs)
+ return -1;
+ if (lhs > rhs)
+ return 1;
+ return 0;
+}
+
+LIBC_INLINE constexpr int compare_half_with_rsqrt(uint16_t y, uint32_t x_sig,
+ int x_exp) {
+ uint32_t y_sig = 0x0400 | (y & 0x03ff);
+ int y_exp = static_cast<int>(y >> 10) - 25;
+ return compare_with_rsqrt(y_sig, y_exp, x_sig, x_exp);
+}
+
+LIBC_INLINE constexpr uint16_t floor_rsqrt(uint16_t approx, uint32_t x_sig,
+ int x_exp) {
+ uint16_t y = approx < 0x0400 ? 0x0400 : approx;
+ while (compare_half_with_rsqrt(y, x_sig, x_exp) > 0)
+ --y;
+ while (y < 0x7bff && compare_half_with_rsqrt(y + 1, x_sig, x_exp) <= 0)
+ ++y;
+ return y;
+}
+
+LIBC_INLINE constexpr uint16_t round_result(uint16_t y, uint32_t x_sig,
+ int x_exp) {
+ if (compare_half_with_rsqrt(y, x_sig, x_exp) == 0)
+ return y;
+
+ int rounding_mode = FE_TONEAREST;
+ if (!cpp::is_constant_evaluated())
+ rounding_mode = fputil::get_round();
+ if (rounding_mode == FE_UPWARD)
+ return y + 1;
+ if (rounding_mode != FE_TONEAREST)
+ return y;
+
+ uint32_t y_sig = 0x0400 | (y & 0x03ff);
+ int y_exp = static_cast<int>(y >> 10) - 25;
+ uint32_t midpoint_sig = (y_sig << 1) | 1;
+ int midpoint_cmp = compare_with_rsqrt(midpoint_sig, y_exp - 1, x_sig, x_exp);
+
+ if (midpoint_cmp < 0)
+ return y + 1;
+ if (midpoint_cmp > 0)
+ return y;
+ return (y & 1) ? static_cast<uint16_t>(y + 1) : y;
+}
+
+LIBC_INLINE constexpr float16 rsqrtf16_no_float(uint16_t x_abs) {
+ uint32_t x_sig = 0;
+ int x_exp = 0;
+ if (x_abs >= 0x0400) {
+ x_sig = 0x0400 | (x_abs & 0x03ff);
+ x_exp = static_cast<int>(x_abs >> 10) - 25;
+ } else {
+ x_sig = x_abs;
+ x_exp = -24;
+ }
+
+ uint16_t approx = approximate_rsqrt(x_abs);
+ uint16_t y = floor_rsqrt(approx, x_sig, x_exp);
+ return fputil::FPBits<float16>(round_result(y, x_sig, x_exp)).get_val();
+}
+
+} // namespace rsqrtf16_internal
+
LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
using FPBits = fputil::FPBits<float16>;
FPBits xbits(x);
@@ -78,73 +243,7 @@ LIBC_INLINE constexpr float16 rsqrtf16(float16 x) {
return fputil::cast<float16>(result);
#else
- float xf = fputil::cast<float>(x);
-
- int exponent = 0;
- float mantissa = fputil::frexp(xf, exponent);
-
- float result = 0.0f;
- int exp_floored = -(exponent >> 1);
-
- if (mantissa == 0.5f) {
- // When mantissa is 0.5f, x was a power of 2 (or subnormal that normalizes
- // this way). 1/sqrt(0.5f) = sqrt(2.0f).
- // If exponent is odd (exponent = 2k + 1):
- // rsqrt(x) = (1/sqrt(0.5)) * 2^(-(2k+1)/2) = sqrt(2) * 2^(-k-0.5)
- // = sqrt(2) * 2^(-k) * (1/sqrt(2)) = 2^(-k)
- // exp_floored = -((2k+1)>>1) = -(k) = -k
- // So result = ldexp(1.0f, exp_floored)
- // If exponent is even (exponent = 2k):
- // rsqrt(x) = (1/sqrt(0.5)) * 2^(-2k/2) = sqrt(2) * 2^(-k)
- // exp_floored = -((2k)>>1) = -(k) = -k
- // So result = ldexp(sqrt(2.0f), exp_floored)
- if (exponent & 1) {
- result = fputil::ldexp(1.0f, exp_floored);
- } else {
- constexpr float SQRT_2_F = 0x1.6a09e6p0f; // sqrt(2.0f)
- result = fputil::ldexp(SQRT_2_F, exp_floored);
- }
- } else {
- // 4 Degree minimax polynomial (single-precision coefficients) generated
- // with Sollya:
- // P = fpminimax(1/sqrt(x), 4,
- // [|single,single,single,single,single|], [0.5;1])
- float y = fputil::polyeval(mantissa,
- 0x1.771256p1f, // c0
- -0x1.5e7c4ap2f, // c1
- 0x1.b3851cp2f, // c2
- -0x1.1a27ep2f, // c3
- 0x1.265c66p0f); // c4
-
- // Newton-Raphson iteration in float (use multiply_add to leverage FMA when
- // available):
- float y2 = y * y;
- float factor = fputil::multiply_add(-0.5f * mantissa, y2, 1.5f);
- y = y * factor;
-
- result = fputil::ldexp(y, exp_floored);
- if (exponent & 1) {
- constexpr float ONE_OVER_SQRT2 = 0x1.6a09e6p-1f; // 1/sqrt(2)
- result *= ONE_OVER_SQRT2;
- }
-
- // Targeted post-correction: for the specific half-precision mantissa
- // pattern M == 0x011F we observe a consistent -1 ULP bias across exponents.
- // Apply a tiny upward nudge to cross the rounding boundary in all modes.
- const uint16_t half_mantissa = static_cast<uint16_t>(x_abs & 0x3ff);
- if (half_mantissa == 0x011F) {
- // Nudge up to fix consistent -1 ULP at that mantissa boundary
- result = fputil::multiply_add(result, 0x1.0p-21f,
- result); // result *= (1 + 2^-21)
- } else if (half_mantissa == 0x0313) {
- // Nudge down to fix +1 ULP under upward rounding at this mantissa
- // boundary
- result = fputil::multiply_add(result, -0x1.0p-21f,
- result); // result *= (1 - 2^-21)
- }
- }
-
- return fputil::cast<float16>(result);
+ return rsqrtf16_internal::rsqrtf16_no_float(x_abs);
#endif
}
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 9d1f4f9e6a493..2200cdc86c785 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2997,7 +2997,6 @@ libc_support_library(
":__support_fputil_cast",
":__support_fputil_fenv_impl",
":__support_fputil_fp_bits",
- ":__support_fputil_manipulation_functions",
":__support_fputil_multiply_add",
":__support_fputil_sqrt",
":__support_macros_optimization",
>From 5fa3ae6668bbf53443f7d311b970951c6e7b5693 Mon Sep 17 00:00:00 2001
From: amemov <shepelev777 at gmail.com>
Date: Sat, 25 Apr 2026 14:01:24 -0700
Subject: [PATCH 4/5] fix: remove duplicate target from cmake
---
libc/src/__support/math/CMakeLists.txt | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 6f75baef8bf96..7eb6c97c3acfa 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -126,20 +126,6 @@ add_header_library(
libc.src.__support.macros.properties.types
)
-add_header_library(
- rsqrtf16
- HDRS
- rsqrtf16.h
- DEPENDS
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.sqrt
- libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.types
-)
-
add_header_library(
asin_utils
HDRS
@@ -2686,8 +2672,10 @@ add_header_library(
libc.src.__support.FPUtil.cast
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.sqrt
libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.types
)
add_header_library(
>From 1e8e5410e08dbfc1f8f3cb5e612bbb59c881bcc6 Mon Sep 17 00:00:00 2001
From: amemov <shepelev777 at gmail.com>
Date: Sun, 26 Apr 2026 19:39:05 -0700
Subject: [PATCH 5/5] reduced branching
---
libc/src/__support/math/CMakeLists.txt | 4 ++++
libc/src/__support/math/rsqrtf16.h | 21 +++++++------------
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
3 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 7eb6c97c3acfa..2b70cf31a4e77 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -2685,11 +2685,15 @@ add_header_library(
DEPENDS
libc.hdr.errno_macros
libc.hdr.fenv_macros
+ libc.include.llvm-libc-macros.float16_macros
+ libc.src.__support.CPP.bit
libc.src.__support.FPUtil.cast
libc.src.__support.FPUtil.fenv_impl
libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
libc.src.__support.FPUtil.sqrt
libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.types
)
add_header_library(
diff --git a/libc/src/__support/math/rsqrtf16.h b/libc/src/__support/math/rsqrtf16.h
index 23c74a8515c15..67eaab1e0f652 100644
--- a/libc/src/__support/math/rsqrtf16.h
+++ b/libc/src/__support/math/rsqrtf16.h
@@ -13,6 +13,7 @@
#ifdef LIBC_TYPES_HAS_FLOAT16
+#include "src/__support/CPP/bit.h"
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "src/__support/FPUtil/cast.h"
@@ -39,12 +40,7 @@ LIBC_INLINE_VAR constexpr int64_t COEFFS[5] = {
LIBC_INLINE_VAR constexpr int64_t ONE_OVER_SQRT2 = 0x16a09e60;
LIBC_INLINE constexpr int floor_log2(uint64_t x) {
- int result = -1;
- while (x) {
- x >>= 1;
- ++result;
- }
- return result;
+ return 63 - cpp::countl_zero(x);
}
LIBC_INLINE constexpr int64_t eval_polynomial(uint32_t m) {
@@ -87,11 +83,9 @@ LIBC_INLINE constexpr uint16_t approximate_rsqrt(uint16_t x_abs) {
x_mant |= 0x0400;
exponent = static_cast<int>(x_abs >> 10) - 14;
} else {
- exponent = -13;
- while ((x_mant & 0x0400) == 0) {
- x_mant <<= 1;
- --exponent;
- }
+ int shift = cpp::countl_zero(x_mant) - (32 - 11);
+ x_mant <<= shift;
+ exponent = -13 - shift;
}
uint32_t m = x_mant << (RSQRT_FRACTION_BITS - 11);
@@ -140,9 +134,10 @@ LIBC_INLINE constexpr int compare_half_with_rsqrt(uint16_t y, uint32_t x_sig,
LIBC_INLINE constexpr uint16_t floor_rsqrt(uint16_t approx, uint32_t x_sig,
int x_exp) {
uint16_t y = approx < 0x0400 ? 0x0400 : approx;
- while (compare_half_with_rsqrt(y, x_sig, x_exp) > 0)
+ if (LIBC_UNLIKELY(compare_half_with_rsqrt(y, x_sig, x_exp) > 0))
--y;
- while (y < 0x7bff && compare_half_with_rsqrt(y + 1, x_sig, x_exp) <= 0)
+ if (LIBC_UNLIKELY(y < 0x7bff &&
+ compare_half_with_rsqrt(y + 1, x_sig, x_exp) <= 0))
++y;
return y;
}
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 2200cdc86c785..89f15f453cc2c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -2994,6 +2994,7 @@ libc_support_library(
name = "__support_math_rsqrtf16",
hdrs = ["src/__support/math/rsqrtf16.h"],
deps = [
+ ":__support_cpp_bit",
":__support_fputil_cast",
":__support_fputil_fenv_impl",
":__support_fputil_fp_bits",
More information about the libc-commits
mailing list