[llvm] r329071 - Revert "[SLP] Fix PR36481: vectorize reassociated instructions."

Benjamin Kramer via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 3 07:40:33 PDT 2018


Author: d0k
Date: Tue Apr  3 07:40:33 2018
New Revision: 329071

URL: http://llvm.org/viewvc/llvm-project?rev=329071&view=rev
Log:
Revert "[SLP] Fix PR36481: vectorize reassociated instructions."

This reverts commits r328980 and r329046, which make the vectorizer crash.

Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/extract.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Tue Apr  3 07:40:33 2018
@@ -667,20 +667,6 @@ int64_t getPtrStride(PredicatedScalarEvo
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
-/// \brief Attempt to sort the pointers in \p VL and return the sorted indices
-/// in \p SortedIndices, if reordering is required.
-///
-/// Returns 'true' if sorting is legal, otherwise returns 'false'.
-///
-/// For example, for a given \p VL of memory accesses in program order, a[i+4],
-/// a[i+0], a[i+1] and a[i+7], this function will sort the \p VL and save the
-/// sorted indices in \p SortedIndices as a[i+0], a[i+1], a[i+4], a[i+7] and
-/// saves the mask for actual memory accesses in program order in
-/// \p SortedIndices as <1,2,0,3>
-bool sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
-                     ScalarEvolution &SE,
-                     SmallVectorImpl<unsigned> &SortedIndices);
-
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass. 
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,

Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Tue Apr  3 07:40:33 2018
@@ -1087,67 +1087,6 @@ int64_t llvm::getPtrStride(PredicatedSca
   return Stride;
 }
 
-bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
-                           ScalarEvolution &SE,
-                           SmallVectorImpl<unsigned> &SortedIndices) {
-  assert(llvm::all_of(
-             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
-         "Expected list of pointer operands.");
-  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
-  OffValPairs.reserve(VL.size());
-
-  // Walk over the pointers, and map each of them to an offset relative to
-  // first pointer in the array.
-  Value *Ptr0 = VL[0];
-  const SCEV *Scev0 = SE.getSCEV(Ptr0);
-  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
-
-  llvm::SmallSet<int64_t, 4> Offsets;
-  for (auto *Ptr : VL) {
-    // TODO: Outline this code as a special, more time consuming, version of
-    // computeConstantDifference() function.
-    if (Ptr->getType()->getPointerAddressSpace() !=
-        Ptr0->getType()->getPointerAddressSpace())
-      return false;
-    // If a pointer refers to a different underlying object, bail - the
-    // pointers are by definition incomparable.
-    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
-    if (CurrObj != Obj0)
-      return false;
-
-    const SCEV *Scev = SE.getSCEV(Ptr);
-    const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
-    // The pointers may not have a constant offset from each other, or SCEV
-    // may just not be smart enough to figure out they do. Regardless,
-    // there's nothing we can do.
-    if (!Diff)
-      return false;
-
-    // Check if the pointer with the same offset is found.
-    int64_t Offset = Diff->getAPInt().getSExtValue();
-    if (!Offsets.insert(Offset).second)
-      return false;
-    OffValPairs.emplace_back(Offset, Ptr);
-  }
-  SortedIndices.clear();
-  SortedIndices.resize(VL.size());
-  std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
-
-  // Sort the memory accesses and keep the order of their uses in UseOrder.
-  std::stable_sort(SortedIndices.begin(), SortedIndices.end(),
-                   [&OffValPairs](unsigned Left, unsigned Right) {
-                     return OffValPairs[Left].first < OffValPairs[Right].first;
-                   });
-
-  // Check if the order is consecutive already.
-  if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
-        return I == SortedIndices[I];
-      }))
-    SortedIndices.clear();
-
-  return true;
-}
-
 /// Take the address space operand from the Load/Store instruction.
 /// Returns -1 if this is not a valid Load/Store instruction.
 static unsigned getAddressSpaceOperand(Value *I) {

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Apr  3 07:40:33 2018
@@ -452,21 +452,16 @@ static bool allSameType(ArrayRef<Value *
 }
 
 /// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static Optional<unsigned> getExtractIndex(Instruction *E) {
-  unsigned Opcode = E->getOpcode();
-  assert((Opcode == Instruction::ExtractElement ||
-          Opcode == Instruction::ExtractValue) &&
-         "Expected extractelement or extractvalue instruction.");
+static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
+  assert(Opcode == Instruction::ExtractElement ||
+         Opcode == Instruction::ExtractValue);
   if (Opcode == Instruction::ExtractElement) {
-    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-    if (!CI)
-      return None;
-    return CI->getZExtValue();
-  }
-  ExtractValueInst *EI = cast<ExtractValueInst>(E);
-  if (EI->getNumIndices() != 1)
-    return None;
-  return *EI->idx_begin();
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+    return CI && CI->getZExtValue() == Idx;
+  } else {
+    ExtractValueInst *EI = cast<ExtractValueInst>(E);
+    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
+  }
 }
 
 /// \returns True if in-tree use also needs extract. This refers to
@@ -591,7 +586,6 @@ public:
     MustGather.clear();
     ExternalUses.clear();
     NumOpsWantToKeepOrder.clear();
-    NumOpsWantToKeepOriginalOrder = 0;
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -604,18 +598,14 @@ public:
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
-  /// \returns The best order of instructions for vectorization.
-  Optional<ArrayRef<unsigned>> bestOrder() const {
-    auto I = std::max_element(
-        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
-        [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
-           const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
-          return D1.second < D2.second;
-        });
-    if (I == NumOpsWantToKeepOrder.end() || I->getSecond() <= NumOpsWantToKeepOriginalOrder)
-      return None;
-
-    return makeArrayRef(I->getFirst());
+  /// \returns true if it is beneficial to reverse the vector order.
+  bool shouldReorder() const {
+    return std::accumulate(
+               NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
+               [](int Val1,
+                  const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
+                 return Val1 + (Val2.second < 0 ? 1 : -1);
+               }) > 0;
   }
 
   /// \return The vector element size in bits to use when vectorizing the
@@ -662,13 +652,9 @@ private:
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
 
-  /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
-  /// be vectorized to use the original vector (or aggregate "bitcast" to a
-  /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
-  /// returns false, setting \p CurrentOrder to either an empty vector or a
-  /// non-identity permutation that allows to reuse extract instructions.
-  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
-                       SmallVectorImpl<unsigned> &CurrentOrder) const;
+  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
+  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
+  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
 
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
@@ -732,9 +718,6 @@ private:
     /// Does this sequence require some shuffling?
     SmallVector<unsigned, 4> ReuseShuffleIndices;
 
-    /// Does this entry require reordering?
-    ArrayRef<unsigned> ReorderIndices;
-
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -750,8 +733,7 @@ private:
 
   /// Create a new VectorizableTree entry.
   void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
-                    ArrayRef<unsigned> ReuseShuffleIndices = None,
-                    ArrayRef<unsigned> ReorderIndices = None) {
+                    ArrayRef<unsigned> ReuseShuffleIndices = None) {
     VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
@@ -759,7 +741,6 @@ private:
     Last->NeedToGather = !Vectorized;
     Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                      ReuseShuffleIndices.end());
-    Last->ReorderIndices = ReorderIndices;
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1221,38 +1202,10 @@ private:
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
-  using OrdersType = SmallVector<unsigned, 4>;
-  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
-  /// sorted SmallVectors of unsigned.
-  struct OrdersTypeDenseMapInfo {
-    static OrdersType getEmptyKey() {
-      OrdersType V;
-      V.push_back(~1U);
-      return V;
-    }
-
-    static OrdersType getTombstoneKey() {
-      OrdersType V;
-      V.push_back(~2U);
-      return V;
-    }
-
-    static unsigned getHashValue(const OrdersType &V) {
-      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
-    }
-
-    static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
-      return LHS == RHS;
-    }
-  };
-
-  /// Contains orders of operations along with the number of bundles that have
-  /// operations in this order. It stores only those orders that require
-  /// reordering, if reordering is not required it is counted using \a
-  /// NumOpsWantToKeepOriginalOrder.
-  DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
-  /// Number of bundles that do not require reordering.
-  unsigned NumOpsWantToKeepOriginalOrder = 0;
+  /// Number of operation bundles that contain consecutive operations - number
+  /// of operation bundles that contain consecutive operations in reversed
+  /// order.
+  DenseMap<unsigned, int> NumOpsWantToKeepOrder;
 
   // Analysis and block reference.
   Function *F;
@@ -1604,35 +1557,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     }
     case Instruction::ExtractValue:
     case Instruction::ExtractElement: {
-      OrdersType CurrentOrder;
-      bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+      bool Reuse = canReuseExtract(VL, VL0);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
-        ++NumOpsWantToKeepOriginalOrder;
-        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
-                     ReuseShuffleIndicies);
-        return;
-      }
-      if (!CurrentOrder.empty()) {
-        DEBUG({
-          dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
-                    "with order";
-          for (unsigned Idx : CurrentOrder)
-            dbgs() << " " << Idx;
-          dbgs() << "\n";
-        });
-        // Insert new order with initial value 0, if it does not exist,
-        // otherwise return the iterator to the existing one.
-        auto StoredCurrentOrderAndNum =
-            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
-        ++StoredCurrentOrderAndNum->getSecond();
-        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
-                     StoredCurrentOrderAndNum->getFirst());
-        return;
+        ++NumOpsWantToKeepOrder[S.Opcode];
+      } else {
+        SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
+        if (canReuseExtract(ReverseVL, VL0))
+          --NumOpsWantToKeepOrder[S.Opcode];
+        BS.cancelScheduling(VL, VL0);
       }
-      DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
-      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
-      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
       return;
     }
     case Instruction::Load: {
@@ -1654,55 +1589,51 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
 
       // Make sure all loads in the bundle are simple - we can't vectorize
       // atomic or volatile loads.
-      SmallVector<Value *, 4> PointerOps(VL.size());
-      auto POIter = PointerOps.begin();
-      for (Value *V : VL) {
-        auto *L = cast<LoadInst>(V);
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+        LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
           newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
-        *POIter = L->getPointerOperand();
-        ++POIter;
       }
 
-      OrdersType CurrentOrder;
-      // Check the order of pointer operands.
-      if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
-        Value *Ptr0;
-        Value *PtrN;
-        if (CurrentOrder.empty()) {
-          Ptr0 = PointerOps.front();
-          PtrN = PointerOps.back();
+      // Check if the loads are consecutive, reversed, or neither.
+      // TODO: What we really want is to sort the loads, but for now, check
+      // the two likely directions.
+      bool Consecutive = true;
+      bool ReverseConsecutive = true;
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+        if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
+          Consecutive = false;
+          break;
         } else {
-          Ptr0 = PointerOps[CurrentOrder.front()];
-          PtrN = PointerOps[CurrentOrder.back()];
+          ReverseConsecutive = false;
         }
-        const SCEV *Scev0 = SE->getSCEV(Ptr0);
-        const SCEV *ScevN = SE->getSCEV(PtrN);
-        const auto *Diff =
-            dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
-        uint64_t Size = DL->getTypeAllocSize(ScalarTy);
-        // Check that the sorted loads are consecutive.
-        if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) {
-          if (CurrentOrder.empty()) {
-            // Original loads are consecutive and does not require reordering.
-            ++NumOpsWantToKeepOriginalOrder;
-            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
-                         ReuseShuffleIndicies);
-            DEBUG(dbgs() << "SLP: added a vector of loads.\n");
-          } else {
-            // Need to reorder.
-            auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
-            ++I->getSecond();
-            newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
-                         ReuseShuffleIndicies, I->getFirst());
-            DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+      }
+
+      if (Consecutive) {
+        ++NumOpsWantToKeepOrder[S.Opcode];
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+        DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+        return;
+      }
+
+      // If none of the load pairs were consecutive when checked in order,
+      // check the reverse order.
+      if (ReverseConsecutive)
+        for (unsigned i = VL.size() - 1; i > 0; --i)
+          if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
+            ReverseConsecutive = false;
+            break;
           }
-          return;
-        }
+
+      if (ReverseConsecutive) {
+        --NumOpsWantToKeepOrder[S.Opcode];
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+        DEBUG(dbgs() << "SLP: added a vector of reversed loads.\n");
+        return;
       }
 
       DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -2013,8 +1944,7 @@ unsigned BoUpSLP::canMapToVector(Type *T
   return N;
 }
 
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
-                              SmallVectorImpl<unsigned> &CurrentOrder) const {
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
   Instruction *E0 = cast<Instruction>(OpValue);
   assert(E0->getOpcode() == Instruction::ExtractElement ||
          E0->getOpcode() == Instruction::ExtractValue);
@@ -2023,8 +1953,6 @@ bool BoUpSLP::canReuseExtract(ArrayRef<V
   // correct offset.
   Value *Vec = E0->getOperand(0);
 
-  CurrentOrder.clear();
-
   // We have to extract from a vector/aggregate with the same number of elements.
   unsigned NElts;
   if (E0->getOpcode() == Instruction::ExtractValue) {
@@ -2044,40 +1972,15 @@ bool BoUpSLP::canReuseExtract(ArrayRef<V
     return false;
 
   // Check that all of the indices extract from the correct offset.
-  bool ShouldKeepOrder = true;
-  unsigned E = VL.size();
-  // Assign to all items the initial value E + 1 so we can check if the extract
-  // instruction index was used already.
-  // Also, later we can check that all the indices are used and we have a
-  // consecutive access in the extract instructions, by checking that no
-  // element of CurrentOrder still has value E + 1.
-  CurrentOrder.assign(E, E + 1);
-  unsigned I = 0;
-  for (; I < E; ++I) {
-    auto *Inst = cast<Instruction>(VL[I]);
+  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+    Instruction *Inst = cast<Instruction>(VL[I]);
+    if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
+      return false;
     if (Inst->getOperand(0) != Vec)
-      break;
-    Optional<unsigned> Idx = getExtractIndex(Inst);
-    if (!Idx)
-      break;
-    const unsigned ExtIdx = *Idx;
-    if (ExtIdx != I) {
-      if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
-        break;
-      ShouldKeepOrder = false;
-      CurrentOrder[ExtIdx] = I;
-    } else {
-      if (CurrentOrder[I] != E + 1)
-        break;
-      CurrentOrder[I] = I;
-    }
-  }
-  if (I < E) {
-    CurrentOrder.clear();
-    return false;
+      return false;
   }
 
-  return ShouldKeepOrder;
+  return true;
 }
 
 bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
@@ -2179,13 +2082,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
               TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
         }
       }
-      if (!E->NeedToGather) {
+      if (canReuseExtract(VL, S.OpValue)) {
         int DeadCost = ReuseShuffleCost;
-        if (!E->ReorderIndices.empty()) {
-          // TODO: Merge this shuffle with the ReuseShuffleCost.
-          DeadCost += TTI->getShuffleCost(
-              TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
-        }
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
           Instruction *E = cast<Instruction>(VL[i]);
           // If all users are going to be vectorized, instruction can be
@@ -2348,8 +2246,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
                                            VecTy, alignment, 0, VL0);
-      if (!E->ReorderIndices.empty()) {
-        // TODO: Merge this shuffle with the ReuseShuffleCost.
+      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
         VecLdCost += TTI->getShuffleCost(
             TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
       }
@@ -2519,13 +2416,13 @@ int BoUpSLP::getSpillCost() {
         LiveValues.insert(cast<Instruction>(&*J));
     }
 
-    DEBUG({
+    DEBUG(
       dbgs() << "SLP: #LV: " << LiveValues.size();
       for (auto *X : LiveValues)
         dbgs() << " " << X->getName();
       dbgs() << ", Looking at ";
       Inst->dump();
-    });
+      );
 
     // Now find the sequence of instructions between PrevInst and Inst.
     BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
@@ -3047,15 +2944,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
   return V;
 }
 
-static void inversePermutation(ArrayRef<unsigned> Indices,
-                               SmallVectorImpl<unsigned> &Mask) {
-  Mask.clear();
-  const unsigned E = Indices.size();
-  Mask.resize(E);
-  for (unsigned I = 0; I < E; ++I)
-    Mask[Indices[I]] = I;
-}
-
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
@@ -3132,19 +3020,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
     }
 
     case Instruction::ExtractElement: {
-      if (!E->NeedToGather) {
+      if (canReuseExtract(E->Scalars, VL0)) {
         Value *V = VL0->getOperand(0);
-        if (!E->ReorderIndices.empty()) {
-          OrdersType Mask;
-          inversePermutation(E->ReorderIndices, Mask);
-          Builder.SetInsertPoint(VL0);
-          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
-                                          "reorder_shuffle");
-        }
         if (NeedToShuffleReuses) {
-          // TODO: Merge this shuffle with the ReorderShuffleMask.
-          if (!E->ReorderIndices.empty())
-            Builder.SetInsertPoint(VL0);
+          Builder.SetInsertPoint(VL0);
           V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                           E->ReuseShuffleIndices, "shuffle");
         }
@@ -3165,21 +3044,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       return V;
     }
     case Instruction::ExtractValue: {
-      if (!E->NeedToGather) {
+      if (canReuseExtract(E->Scalars, VL0)) {
         LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
         Builder.SetInsertPoint(LI);
         PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
         Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
         LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
         Value *NewV = propagateMetadata(V, E->Scalars);
-        if (!E->ReorderIndices.empty()) {
-          OrdersType Mask;
-          inversePermutation(E->ReorderIndices, Mask);
-          NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
-                                             "reorder_shuffle");
-        }
         if (NeedToShuffleReuses) {
-          // TODO: Merge this shuffle with the ReorderShuffleMask.
           NewV = Builder.CreateShuffleVector(
               NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
         }
@@ -3353,9 +3225,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
     case Instruction::Load: {
       // Loads are inserted at the head of the tree because we don't want to
       // sink them all the way down past store instructions.
-      bool IsReorder = !E->ReorderIndices.empty();
-      if (IsReorder)
-        VL0 = cast<Instruction>(E->Scalars[E->ReorderIndices.front()]);
+      bool IsReversed =
+          !isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE);
+      if (IsReversed)
+        VL0 = cast<Instruction>(E->Scalars.back());
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       LoadInst *LI = cast<LoadInst>(VL0);
@@ -3379,14 +3252,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       }
       LI->setAlignment(Alignment);
       Value *V = propagateMetadata(LI, E->Scalars);
-      if (IsReorder) {
-        OrdersType Mask;
-        inversePermutation(E->ReorderIndices, Mask);
-        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
-                                        Mask, "reorder_shuffle");
+      if (IsReversed) {
+        SmallVector<uint32_t, 4> Mask(E->Scalars.size());
+        std::iota(Mask.rbegin(), Mask.rend(), 0);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
       }
       if (NeedToShuffleReuses) {
-        // TODO: Merge this shuffle with the ReorderShuffleMask.
         V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                         E->ReuseShuffleIndices, "shuffle");
       }
@@ -4954,10 +4825,8 @@ bool SLPVectorizerPass::tryToVectorizeLi
       ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
 
       R.buildTree(Ops);
-      Optional<ArrayRef<unsigned>> Order = R.bestOrder();
       // TODO: check if we can allow reordering for more cases.
-      if (AllowReorder && Order) {
-        // TODO: reorder tree nodes without tree rebuilding.
+      if (AllowReorder && R.shouldReorder()) {
         // Conceptually, there is nothing actually preventing us from trying to
         // reorder a larger list. In fact, we do exactly this when vectorizing
         // reductions. However, at this point, we only expect to get here when
@@ -5703,13 +5572,9 @@ public:
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
-      Optional<ArrayRef<unsigned>> Order = V.bestOrder();
-      if (Order) {
-        // TODO: reorder tree nodes without tree rebuilding.
-        SmallVector<Value *, 4> ReorderedOps(VL.size());
-        llvm::transform(*Order, ReorderedOps.begin(),
-                        [VL](const unsigned Idx) { return VL[Idx]; });
-        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+      if (V.shouldReorder()) {
+        SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
+        V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
@@ -6214,7 +6079,7 @@ bool SLPVectorizerPass::vectorizeChainsI
 
       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
-      DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
       // The order in which the phi nodes appear in the program does not matter.
       // So allow tryToVectorizeList to reorder them if it is beneficial. This
       // is done when there are exactly two elements since tryToVectorizeList

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/external_user_jumbled_load.ll Tue Apr  3 07:40:33 2018
@@ -10,16 +10,15 @@ define void @hoge(i64 %idx, <4 x i32>* %
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 6
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 7
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [20 x [13 x i32]], [20 x [13 x i32]]* @array, i64 0, i64 [[IDX]], i64 8
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* [[TMP3]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP10]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP12]], i32 3
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* [[SINK:%.*]]
 ; CHECK-NEXT:    ret void

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/extract.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/extract.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/extract.ll Tue Apr  3 07:40:33 2018
@@ -30,11 +30,14 @@ define void @fextr1(double* %ptr) {
 ; CHECK-LABEL: @fextr1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[V0:%.*]] = extractelement <2 x double> [[LD]], i32 0
+; CHECK-NEXT:    [[V1:%.*]] = extractelement <2 x double> [[LD]], i32 1
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> <double 3.400000e+00, double 1.200000e+00>, [[REORDER_SHUFFLE]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V0]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> <double 3.400000e+00, double 1.200000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[P1]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll Tue Apr  3 07:40:33 2018
@@ -11,16 +11,21 @@
     define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[REORDER_SHUFFLE]], zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP6]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1) to <2 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp sgt <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 8, i32 3
+; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP12]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll Tue Apr  3 07:40:33 2018
@@ -21,21 +21,28 @@
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP11]]
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -76,21 +83,28 @@ entry:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP5]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <4 x i32> [[TMP11]], [[TMP4]]
 ; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll Tue Apr  3 07:40:33 2018
@@ -48,11 +48,11 @@ define void @phiUsingLoads(i32* noalias
 ; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP34:%.*]], <4 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP34]], [[FOR_INC]] ]
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
@@ -103,16 +103,23 @@ define void @phiUsingLoads(i32* noalias
 ; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX49]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP23:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP24:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX55]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX58]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i32> undef, i32 [[TMP28]], i32 0
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x i32> [[TMP23]], i32 1
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP30]], i32 1
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP25]], i32 2
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP27]], i32 3
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
-; CHECK-NEXT:    [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
+; CHECK-NEXT:    [[TMP34]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP33]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll Tue Apr  3 07:40:33 2018
@@ -6,26 +6,33 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
@@ -64,27 +71,25 @@ define i32 @jumbled-load(i32* noalias no
 define i32 @jumbled-load-multiuses(i32* noalias nocapture %in, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load-multiuses(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP7]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[REORDER_SHUFFLE]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP11:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[TMP10]]
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_4]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_2]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_1]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_3]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reassociated-loads.ll Tue Apr  3 07:40:33 2018
@@ -5,49 +5,70 @@ define signext i8 @Foo(<32 x i8>* %__v)
 ; CHECK-LABEL: @Foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i8>, <32 x i8>* [[__V:%.*]], align 32
-; CHECK-NEXT:    [[ADD_I_1_I:%.*]] = add i8 undef, undef
-; CHECK-NEXT:    [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], undef
-; CHECK-NEXT:    [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], undef
-; CHECK-NEXT:    [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], undef
-; CHECK-NEXT:    [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], undef
-; CHECK-NEXT:    [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], undef
-; CHECK-NEXT:    [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], undef
-; CHECK-NEXT:    [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], undef
-; CHECK-NEXT:    [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], undef
-; CHECK-NEXT:    [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], undef
-; CHECK-NEXT:    [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], undef
-; CHECK-NEXT:    [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], undef
-; CHECK-NEXT:    [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], undef
-; CHECK-NEXT:    [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], undef
-; CHECK-NEXT:    [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], undef
-; CHECK-NEXT:    [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], undef
-; CHECK-NEXT:    [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], undef
-; CHECK-NEXT:    [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], undef
-; CHECK-NEXT:    [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], undef
-; CHECK-NEXT:    [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], undef
-; CHECK-NEXT:    [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], undef
-; CHECK-NEXT:    [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], undef
-; CHECK-NEXT:    [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], undef
-; CHECK-NEXT:    [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], undef
-; CHECK-NEXT:    [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], undef
-; CHECK-NEXT:    [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], undef
-; CHECK-NEXT:    [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], undef
-; CHECK-NEXT:    [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], undef
-; CHECK-NEXT:    [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], undef
-; CHECK-NEXT:    [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], undef
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x i8> [[TMP0]], <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <32 x i8> [[TMP0]], [[RDX_SHUF]]
-; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i8> [[BIN_RDX]], <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <32 x i8> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x i8> [[BIN_RDX2]], <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <32 x i8> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x i8> [[BIN_RDX4]], <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX6:%.*]] = add <32 x i8> [[BIN_RDX4]], [[RDX_SHUF5]]
-; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i8> [[BIN_RDX6]], <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX8:%.*]] = add <32 x i8> [[BIN_RDX6]], [[RDX_SHUF7]]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <32 x i8> [[BIN_RDX8]], i32 0
-; CHECK-NEXT:    [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], undef
-; CHECK-NEXT:    ret i8 [[TMP1]]
+; CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 0
+; CHECK-NEXT:    [[VECEXT_I_I_1_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 1
+; CHECK-NEXT:    [[ADD_I_1_I:%.*]] = add i8 [[VECEXT_I_I_1_I]], [[VECEXT_I_I_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_2_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 2
+; CHECK-NEXT:    [[ADD_I_2_I:%.*]] = add i8 [[ADD_I_1_I]], [[VECEXT_I_I_2_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_3_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 3
+; CHECK-NEXT:    [[ADD_I_3_I:%.*]] = add i8 [[ADD_I_2_I]], [[VECEXT_I_I_3_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_4_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 4
+; CHECK-NEXT:    [[ADD_I_4_I:%.*]] = add i8 [[ADD_I_3_I]], [[VECEXT_I_I_4_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_5_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 5
+; CHECK-NEXT:    [[ADD_I_5_I:%.*]] = add i8 [[ADD_I_4_I]], [[VECEXT_I_I_5_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_6_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 6
+; CHECK-NEXT:    [[ADD_I_6_I:%.*]] = add i8 [[ADD_I_5_I]], [[VECEXT_I_I_6_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_7_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 7
+; CHECK-NEXT:    [[ADD_I_7_I:%.*]] = add i8 [[ADD_I_6_I]], [[VECEXT_I_I_7_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_8_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 8
+; CHECK-NEXT:    [[ADD_I_8_I:%.*]] = add i8 [[ADD_I_7_I]], [[VECEXT_I_I_8_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_9_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 9
+; CHECK-NEXT:    [[ADD_I_9_I:%.*]] = add i8 [[ADD_I_8_I]], [[VECEXT_I_I_9_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_10_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 10
+; CHECK-NEXT:    [[ADD_I_10_I:%.*]] = add i8 [[ADD_I_9_I]], [[VECEXT_I_I_10_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_11_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 11
+; CHECK-NEXT:    [[ADD_I_11_I:%.*]] = add i8 [[ADD_I_10_I]], [[VECEXT_I_I_11_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_12_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 12
+; CHECK-NEXT:    [[ADD_I_12_I:%.*]] = add i8 [[ADD_I_11_I]], [[VECEXT_I_I_12_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_13_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 13
+; CHECK-NEXT:    [[ADD_I_13_I:%.*]] = add i8 [[ADD_I_12_I]], [[VECEXT_I_I_13_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_14_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 14
+; CHECK-NEXT:    [[ADD_I_14_I:%.*]] = add i8 [[ADD_I_13_I]], [[VECEXT_I_I_14_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_15_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 15
+; CHECK-NEXT:    [[ADD_I_15_I:%.*]] = add i8 [[ADD_I_14_I]], [[VECEXT_I_I_15_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_16_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 16
+; CHECK-NEXT:    [[ADD_I_16_I:%.*]] = add i8 [[ADD_I_15_I]], [[VECEXT_I_I_16_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_17_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 17
+; CHECK-NEXT:    [[ADD_I_17_I:%.*]] = add i8 [[ADD_I_16_I]], [[VECEXT_I_I_17_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_18_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 18
+; CHECK-NEXT:    [[ADD_I_18_I:%.*]] = add i8 [[ADD_I_17_I]], [[VECEXT_I_I_18_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_19_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 19
+; CHECK-NEXT:    [[ADD_I_19_I:%.*]] = add i8 [[ADD_I_18_I]], [[VECEXT_I_I_19_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_20_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 20
+; CHECK-NEXT:    [[ADD_I_20_I:%.*]] = add i8 [[ADD_I_19_I]], [[VECEXT_I_I_20_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_21_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 21
+; CHECK-NEXT:    [[ADD_I_21_I:%.*]] = add i8 [[ADD_I_20_I]], [[VECEXT_I_I_21_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_22_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 22
+; CHECK-NEXT:    [[ADD_I_22_I:%.*]] = add i8 [[ADD_I_21_I]], [[VECEXT_I_I_22_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_23_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 23
+; CHECK-NEXT:    [[ADD_I_23_I:%.*]] = add i8 [[ADD_I_22_I]], [[VECEXT_I_I_23_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_24_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 24
+; CHECK-NEXT:    [[ADD_I_24_I:%.*]] = add i8 [[ADD_I_23_I]], [[VECEXT_I_I_24_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_25_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 25
+; CHECK-NEXT:    [[ADD_I_25_I:%.*]] = add i8 [[ADD_I_24_I]], [[VECEXT_I_I_25_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_26_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 26
+; CHECK-NEXT:    [[ADD_I_26_I:%.*]] = add i8 [[ADD_I_25_I]], [[VECEXT_I_I_26_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_27_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 27
+; CHECK-NEXT:    [[ADD_I_27_I:%.*]] = add i8 [[ADD_I_26_I]], [[VECEXT_I_I_27_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_28_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 28
+; CHECK-NEXT:    [[ADD_I_28_I:%.*]] = add i8 [[ADD_I_27_I]], [[VECEXT_I_I_28_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_29_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 29
+; CHECK-NEXT:    [[ADD_I_29_I:%.*]] = add i8 [[ADD_I_28_I]], [[VECEXT_I_I_29_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_30_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 30
+; CHECK-NEXT:    [[ADD_I_30_I:%.*]] = add i8 [[ADD_I_29_I]], [[VECEXT_I_I_30_I]]
+; CHECK-NEXT:    [[VECEXT_I_I_31_I:%.*]] = extractelement <32 x i8> [[TMP0]], i64 31
+; CHECK-NEXT:    [[ADD_I_31_I:%.*]] = add i8 [[ADD_I_30_I]], [[VECEXT_I_I_31_I]]
+; CHECK-NEXT:    ret i8 [[ADD_I_31_I]]
 ;
 entry:
   %0 = load <32 x i8>, <32 x i8>* %__v, align 32

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll?rev=329071&r1=329070&r2=329071&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll Tue Apr  3 07:40:33 2018
@@ -6,26 +6,33 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP5:%.*]] = mul <4 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
