[libc-commits] [libc] 1768cb3 - Use __builtin_clz to find leading 1 in generic sqrt (where possible)
Tue Ly via libc-commits
libc-commits at lists.llvm.org
Mon Feb 28 14:34:35 PST 2022
Author: Clint Caywood
Date: 2022-02-28T17:33:49-05:00
New Revision: 1768cb3a674ad657dd3eccd001f0a3e9a7675af6
URL: https://github.com/llvm/llvm-project/commit/1768cb3a674ad657dd3eccd001f0a3e9a7675af6
DIFF: https://github.com/llvm/llvm-project/commit/1768cb3a674ad657dd3eccd001f0a3e9a7675af6.diff
LOG: Use __builtin_clz to find leading 1 in generic sqrt (where possible)
__builtin_clz requires just a single instruction on x86 and arm, so this is a performance improvement.
Reviewed By: lntue
Differential Revision: https://reviews.llvm.org/D120579
Added:
Modified:
libc/src/__support/FPUtil/generic/sqrt.h
libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
Removed:
################################################################################
diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h
index ad85bca94af55..3efc98b4528ac 100644
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@@ -31,47 +31,28 @@ template <> struct SpecialLongDouble<long double> {
};
#endif // SPECIAL_X86_LONG_DOUBLE
-template <typename T>
-static inline void normalize(int &exponent,
- typename FPBits<T>::UIntType &mantissa);
-
-template <> inline void normalize<float>(int &exponent, uint32_t &mantissa) {
- // Use binary search to shift the leading 1 bit.
- // With MantissaWidth<float> = 23, it will take
- // ceil(log2(23)) = 5 steps checking the mantissa bits as followed:
- // Step 1: 0000 0000 0000 XXXX XXXX XXXX
- // Step 2: 0000 00XX XXXX XXXX XXXX XXXX
- // Step 3: 000X XXXX XXXX XXXX XXXX XXXX
- // Step 4: 00XX XXXX XXXX XXXX XXXX XXXX
- // Step 5: 0XXX XXXX XXXX XXXX XXXX XXXX
- constexpr int NSTEPS = 5; // = ceil(log2(MantissaWidth))
- constexpr uint32_t BOUNDS[NSTEPS] = {1 << 12, 1 << 18, 1 << 21, 1 << 22,
- 1 << 23};
- constexpr int SHIFTS[NSTEPS] = {12, 6, 3, 2, 1};
-
- for (int i = 0; i < NSTEPS; ++i) {
- if (mantissa < BOUNDS[i]) {
- exponent -= SHIFTS[i];
- mantissa <<= SHIFTS[i];
- }
- }
+// The following overloads are matched based on what is accepted by
+// __builtin_clz* rather than using the exactly-sized aliases from stdint.h.
+// This way, we can avoid making any assumptions about integer sizes and let the
+// compiler match for us.
+template <typename T> static inline int clz(T val);
+template <> inline int clz<unsigned int>(unsigned int val) {
+ return __builtin_clz(val);
+}
+template <> inline int clz<unsigned long int>(unsigned long int val) {
+ return __builtin_clzl(val);
+}
+template <> inline int clz<unsigned long long int>(unsigned long long int val) {
+ return __builtin_clzll(val);
}
-template <> inline void normalize<double>(int &exponent, uint64_t &mantissa) {
- // Use binary search to shift the leading 1 bit similar to float.
- // With MantissaWidth<double> = 52, it will take
- // ceil(log2(52)) = 6 steps checking the mantissa bits.
- constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth))
- constexpr uint64_t BOUNDS[NSTEPS] = {1ULL << 26, 1ULL << 39, 1ULL << 46,
- 1ULL << 49, 1ULL << 51, 1ULL << 52};
- constexpr int SHIFTS[NSTEPS] = {27, 14, 7, 4, 2, 1};
-
- for (int i = 0; i < NSTEPS; ++i) {
- if (mantissa < BOUNDS[i]) {
- exponent -= SHIFTS[i];
- mantissa <<= SHIFTS[i];
- }
- }
+template <typename T>
+static inline void normalize(int &exponent,
+ typename FPBits<T>::UIntType &mantissa) {
+ const int shift =
+ clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
+ exponent -= shift;
+ mantissa <<= shift;
}
#ifdef LONG_DOUBLE_IS_DOUBLE
@@ -82,22 +63,11 @@ inline void normalize<long double>(int &exponent, uint64_t &mantissa) {
#elif !defined(SPECIAL_X86_LONG_DOUBLE)
template <>
inline void normalize<long double>(int &exponent, __uint128_t &mantissa) {
- // Use binary search to shift the leading 1 bit similar to float.
- // With MantissaWidth<long double> = 112, it will take
- // ceil(log2(112)) = 7 steps checking the mantissa bits.
- constexpr int NSTEPS = 7; // = ceil(log2(MantissaWidth))
- constexpr __uint128_t BOUNDS[NSTEPS] = {
- __uint128_t(1) << 56, __uint128_t(1) << 84, __uint128_t(1) << 98,
- __uint128_t(1) << 105, __uint128_t(1) << 109, __uint128_t(1) << 111,
- __uint128_t(1) << 112};
- constexpr int SHIFTS[NSTEPS] = {57, 29, 15, 8, 4, 2, 1};
-
- for (int i = 0; i < NSTEPS; ++i) {
- if (mantissa < BOUNDS[i]) {
- exponent -= SHIFTS[i];
- mantissa <<= SHIFTS[i];
- }
- }
+ const uint64_t hi_bits = static_cast<uint64_t>(mantissa >> 64);
+ const int shift = hi_bits ? (clz(hi_bits) - 15)
+ : (clz(static_cast<uint64_t>(mantissa)) + 49);
+ exponent -= shift;
+ mantissa <<= shift;
}
#endif
diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
index 82a996b21378e..b7547045196e3 100644
--- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
+++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
@@ -18,21 +18,11 @@ namespace fputil {
namespace x86 {
inline void normalize(int &exponent, __uint128_t &mantissa) {
- // Use binary search to shift the leading 1 bit similar to float.
- // With MantissaWidth<long double> = 63, it will take
- // ceil(log2(63)) = 6 steps checking the mantissa bits.
- constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth))
- constexpr __uint128_t BOUNDS[NSTEPS] = {
- __uint128_t(1) << 32, __uint128_t(1) << 48, __uint128_t(1) << 56,
- __uint128_t(1) << 60, __uint128_t(1) << 62, __uint128_t(1) << 63};
- constexpr int SHIFTS[NSTEPS] = {32, 16, 8, 4, 2, 1};
-
- for (int i = 0; i < NSTEPS; ++i) {
- if (mantissa < BOUNDS[i]) {
- exponent -= SHIFTS[i];
- mantissa <<= SHIFTS[i];
- }
- }
+ const int shift =
+ __builtin_clzll(static_cast<uint64_t>(mantissa)) -
+ (8 * sizeof(uint64_t) - 1 - MantissaWidth<long double>::VALUE);
+ exponent -= shift;
+ mantissa <<= shift;
}
// if constexpr statement in sqrt.h still requires x86::sqrt to be declared
More information about the libc-commits mailing list