[libc-commits] [PATCH] D120579: Use __builtin_clz to find leading 1 in generic sqrt (where possible)

Fri Feb 25 11:12:54 PST 2022

cratonica created this revision.
cratonica added reviewers: lntue, sivachandra.
Herald added subscribers: libc-commits, pengfei, kristof.beyls.
Herald added a project: libc-project.
cratonica requested review of this revision.

__builtin_clz requires just a single instruction on x86 and arm, so this is a performance improvement.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D120579

Files:
  libc/src/__support/FPUtil/generic/sqrt.h


Index: libc/src/__support/FPUtil/generic/sqrt.h
===================================================================

--- libc/src/__support/FPUtil/generic/sqrt.h
+++ libc/src/__support/FPUtil/generic/sqrt.h
@@ -21,6 +21,8 @@
 
 namespace internal {
 
+template <typename T> static inline int clz(T val);
+
 template <typename T> struct SpecialLongDouble {
   static constexpr bool VALUE = false;
 };
@@ -31,47 +33,27 @@
 };
 #endif // SPECIAL_X86_LONG_DOUBLE
 
-template <typename T>
-static inline void normalize(int &exponent,
-                             typename FPBits<T>::UIntType &mantissa);
-
-template <> inline void normalize<float>(int &exponent, uint32_t &mantissa) {
-  // Use binary search to shift the leading 1 bit.
-  // With MantissaWidth<float> = 23, it will take
-  // ceil(log2(23)) = 5 steps checking the mantissa bits as followed:
-  // Step 1: 0000 0000 0000 XXXX XXXX XXXX
-  // Step 2: 0000 00XX XXXX XXXX XXXX XXXX
-  // Step 3: 000X XXXX XXXX XXXX XXXX XXXX
-  // Step 4: 00XX XXXX XXXX XXXX XXXX XXXX
-  // Step 5: 0XXX XXXX XXXX XXXX XXXX XXXX
-  constexpr int NSTEPS = 5; // = ceil(log2(MantissaWidth))
-  constexpr uint32_t BOUNDS[NSTEPS] = {1 << 12, 1 << 18, 1 << 21, 1 << 22,
-                                       1 << 23};
-  constexpr int SHIFTS[NSTEPS] = {12, 6, 3, 2, 1};
-
-  for (int i = 0; i < NSTEPS; ++i) {
-    if (mantissa < BOUNDS[i]) {
-      exponent -= SHIFTS[i];
-      mantissa <<= SHIFTS[i];
-    }
-  }
+// The following overloads are matched based on what is accepted by
+// __builtin_clz* rather than using the exactly-sized aliases from stdint.h.
+// This way, we can avoid making any assumptions about integer sizes and let the
+// compiler match for us.
+template <> inline int clz<unsigned int>(unsigned int val) {
+  return __builtin_clz(val);
+}
+template <> inline int clz<unsigned long int>(unsigned long int val) {
+  return __builtin_clzl(val);
+}
+template <> inline int clz<unsigned long long int>(unsigned long long int val) {
+  return __builtin_clzll(val);
 }
 
-template <> inline void normalize<double>(int &exponent, uint64_t &mantissa) {
-  // Use binary search to shift the leading 1 bit similar to float.
-  // With MantissaWidth<double> = 52, it will take
-  // ceil(log2(52)) = 6 steps checking the mantissa bits.
-  constexpr int NSTEPS = 6; // = ceil(log2(MantissaWidth))
-  constexpr uint64_t BOUNDS[NSTEPS] = {1ULL << 26, 1ULL << 39, 1ULL << 46,
-                                       1ULL << 49, 1ULL << 51, 1ULL << 52};
-  constexpr int SHIFTS[NSTEPS] = {27, 14, 7, 4, 2, 1};
-
-  for (int i = 0; i < NSTEPS; ++i) {
-    if (mantissa < BOUNDS[i]) {
-      exponent -= SHIFTS[i];
-      mantissa <<= SHIFTS[i];
-    }
-  }
+template <typename T>
+static inline void normalize(int &exponent,
+                             typename FPBits<T>::UIntType &mantissa) {
+  const int shift =
+      clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
+  exponent -= shift;
+  mantissa <<= shift;
 }
 
 #ifdef LONG_DOUBLE_IS_DOUBLE
@@ -82,9 +64,9 @@
 #elif !defined(SPECIAL_X86_LONG_DOUBLE)
 template <>
 inline void normalize<long double>(int &exponent, __uint128_t &mantissa) {
-  // Use binary search to shift the leading 1 bit similar to float.
-  // With MantissaWidth<long double> = 112, it will take
-  // ceil(log2(112)) = 7 steps checking the mantissa bits.
+  // There is no __builtin_clz for 128-bit integers, so use binary search to
+  // shift the leading 1 bit. With MantissaWidth<long double> = 112, it will
+  // take ceil(log2(112)) = 7 steps checking the mantissa bits.
   constexpr int NSTEPS = 7; // = ceil(log2(MantissaWidth))
   constexpr __uint128_t BOUNDS[NSTEPS] = {
       __uint128_t(1) << 56,  __uint128_t(1) << 84,  __uint128_t(1) << 98,


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D120579.411467.patch
Type: text/x-patch
Size: 3762 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/libc-commits/attachments/20220225/33a4bd8d/attachment.bin>