[llvm] r302171 - [APInt] Reduce number of allocations involved in multiplying. Reduce worst case multiply size

Thu May 4 10:00:41 PDT 2017

Author: ctopper
Date: Thu May  4 12:00:41 2017
New Revision: 302171

URL: http://llvm.org/viewvc/llvm-project?rev=302171&view=rev
Log:
[APInt] Reduce number of allocations involved in multiplying. Reduce worst case multiply size

Currently multiply is implemented in operator*=. Operator* makes a copy and uses operator*= to modify the copy.

Operator*= itself allocates a temporary buffer to hold the multiply result as it computes it. It then copies the result to the buffer in *this.

Operator*= attempts to bound the size of the result based on the number of active bits in its inputs. It also has a couple of special cases to handle 0 inputs without any memory allocations or multiply operations. The best case is that it calculates a single word regardless of input bit width. The worst case is that it calculates a 2x input width result and drops the upper bits.

Since operator* uses operator*= it incurs two allocations, one for a copy of *this and one for the temporary allocation. Neither of these allocations is kept after the operation is done.

The main usage in the backend appears to be ConstantRange::multiply which uses operator* rather than operator*=.

This patch moves the multiply operation to operator* and implements operator*= using it. This avoids the copy in operator*. operator* now allocates a result buffer sized the same width as its inputs no matter what. This buffer will be used as the buffer for the returned APInt. Finally, we reuse tcMultiply to implement the multiply operation. This function is capable of not calculating additional upper words that will be discarded.

This change does lose the special optimizations for inputs that use fewer words than their size implies. But it also removes the getActiveBits calls from all multiplies. If we think those optimizations are important we could look at providing additional bounds to tcMultiply to limit the computations.

Differential Revision: https://reviews.llvm.org/D32830

Modified:
    llvm/trunk/lib/Support/APInt.cpp

Modified: llvm/trunk/lib/Support/APInt.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/APInt.cpp?rev=302171&r1=302170&r2=302171&view=diff
==============================================================================

--- llvm/trunk/lib/Support/APInt.cpp (original)
+++ llvm/trunk/lib/Support/APInt.cpp Thu May  4 12:00:41 2017
@@ -225,114 +225,17 @@ APInt& APInt::operator-=(uint64_t RHS) {
   return clearUnusedBits();
 }
 
-/// Multiplies an integer array, x, by a uint64_t integer and places the result
-/// into dest.
-/// @returns the carry out of the multiplication.
-/// @brief Multiply a multi-digit APInt by a single digit (64-bit) integer.
-static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
-  // Split y into high 32-bit part (hy)  and low 32-bit part (ly)
-  uint64_t ly = y & 0xffffffffULL, hy = y >> 32;
-  uint64_t carry = 0;
-
-  // For each digit of x.
-  for (unsigned i = 0; i < len; ++i) {
-    // Split x into high and low words
-    uint64_t lx = x[i] & 0xffffffffULL;
-    uint64_t hx = x[i] >> 32;
-    // hasCarry - A flag to indicate if there is a carry to the next digit.
-    // hasCarry == 0, no carry
-    // hasCarry == 1, has carry
-    // hasCarry == 2, no carry and the calculation result == 0.
-    uint8_t hasCarry = 0;
-    dest[i] = carry + lx * ly;
-    // Determine if the add above introduces carry.
-    hasCarry = (dest[i] < carry) ? 1 : 0;
-    carry = hx * ly + (dest[i] >> 32) + (hasCarry ? (1ULL << 32) : 0);
-    // The upper limit of carry can be (2^32 - 1)(2^32 - 1) +
-    // (2^32 - 1) + 2^32 = 2^64.
-    hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
-
-    carry += (lx * hy) & 0xffffffffULL;
-    dest[i] = (carry << 32) | (dest[i] & 0xffffffffULL);
-    carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) +
-            (carry >> 32) + ((lx * hy) >> 32) + hx * hy;
-  }
-  return carry;
-}
-
-/// Multiplies integer array x by integer array y and stores the result into
-/// the integer array dest. Note that dest's size must be >= xlen + ylen.
-/// @brief Generalized multiplication of integer arrays.
-static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
-                unsigned ylen) {
-  dest[xlen] = mul_1(dest, x, xlen, y[0]);
-  for (unsigned i = 1; i < ylen; ++i) {
-    uint64_t ly = y[i] & 0xffffffffULL, hy = y[i] >> 32;
-    uint64_t carry = 0, lx = 0, hx = 0;
-    for (unsigned j = 0; j < xlen; ++j) {
-      lx = x[j] & 0xffffffffULL;
-      hx = x[j] >> 32;
-      // hasCarry - A flag to indicate if has carry.
-      // hasCarry == 0, no carry
-      // hasCarry == 1, has carry
-      // hasCarry == 2, no carry and the calculation result == 0.
-      uint8_t hasCarry = 0;
-      uint64_t resul = carry + lx * ly;
-      hasCarry = (resul < carry) ? 1 : 0;
-      carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + (resul >> 32);
-      hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
-
-      carry += (lx * hy) & 0xffffffffULL;
-      resul = (carry << 32) | (resul & 0xffffffffULL);
-      dest[i+j] += resul;
-      carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0)+
-              (carry >> 32) + (dest[i+j] < resul ? 1 : 0) +
-              ((lx * hy) >> 32) + hx * hy;
-    }
-    dest[i+xlen] = carry;
-  }
-}
-
-APInt& APInt::operator*=(const APInt& RHS) {
+APInt APInt::operator*(const APInt& RHS) const {
   assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-  if (isSingleWord()) {
-    U.VAL *= RHS.U.VAL;
-    clearUnusedBits();
-    return *this;
-  }
-
-  // Get some bit facts about LHS and check for zero
-  unsigned lhsBits = getActiveBits();
-  unsigned lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
-  if (!lhsWords)
-    // 0 * X ===> 0
-    return *this;
-
-  // Get some bit facts about RHS and check for zero
-  unsigned rhsBits = RHS.getActiveBits();
-  unsigned rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
-  if (!rhsWords) {
-    // X * 0 ===> 0
-    clearAllBits();
-    return *this;
-  }
-
-  // Allocate space for the result
-  unsigned destWords = rhsWords + lhsWords;
-  uint64_t *dest = getMemory(destWords);
-
-  // Perform the long multiply
-  mul(dest, U.pVal, lhsWords, RHS.U.pVal, rhsWords);
-
-  // Copy result back into *this
-  clearAllBits();
-  unsigned wordsToCopy = destWords >= getNumWords() ? getNumWords() : destWords;
-  memcpy(U.pVal, dest, wordsToCopy * APINT_WORD_SIZE);
-  clearUnusedBits();
+  if (isSingleWord())
+    return APInt(BitWidth, U.VAL * RHS.U.VAL);
 
-  // delete dest array and return
-  delete[] dest;
-  return *this;
+  APInt Result(getMemory(getNumWords()), getBitWidth());
+
+  tcMultiply(Result.U.pVal, U.pVal, RHS.U.pVal, getNumWords());
+
+  Result.clearUnusedBits();
+  return Result;
 }
 
 void APInt::AndAssignSlowCase(const APInt& RHS) {
@@ -347,13 +250,10 @@ void APInt::XorAssignSlowCase(const APIn
   tcXor(U.pVal, RHS.U.pVal, getNumWords());
 }
 
-APInt APInt::operator*(const APInt& RHS) const {
+APInt& APInt::operator*=(const APInt& RHS) {
   assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-  if (isSingleWord())
-    return APInt(BitWidth, U.VAL * RHS.U.VAL);
-  APInt Result(*this);
-  Result *= RHS;
-  return Result;
+  *this = *this * RHS;
+  return *this;
 }
 
 bool APInt::EqualSlowCase(const APInt& RHS) const {