[llvm] r320548 - [SLP] Vectorize jumbled memory loads.

Tue Dec 12 19:08:29 PST 2017

Author: ashahid
Date: Tue Dec 12 19:08:29 2017
New Revision: 320548

URL: http://llvm.org/viewvc/llvm-project?rev=320548&view=rev
Log:
[SLP] Vectorize jumbled memory loads.

Summary:
This patch tries to vectorize loads of consecutive memory accesses, accessed
in non-consecutive or jumbled way. An earlier attempt was made with patch D26905
which was reverted back due to some basic issue with representing the 'use mask' of
jumbled accesses.

This patch fixes the mask representation by recording the 'use mask' in the usertree entry.

Change-Id: I9fe7f5045f065d84c126fa307ef6ebe0787296df

Reviewers: mkuper, loladiro, Ayal, zvi, danielcdh

Reviewed By: Ayal

Subscribers: mgrang, dcaballe, hans, mzolotukhin

Differential Revision: https://reviews.llvm.org/D36130

Added:
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Tue Dec 12 19:08:29 2017
@@ -667,6 +667,21 @@ int64_t getPtrStride(PredicatedScalarEvo
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
+/// \brief Attempt to sort the 'loads' in \p VL and return the sorted values in
+/// \p Sorted.
+///
+/// Returns 'false' if sorting is not legal or feasible, otherwise returns
+/// 'true'. If \p Mask is not null, it also returns the \p Mask which is the
+/// shuffle mask for actual memory access order.
+///
+/// For example, for a given VL of memory accesses in program order, a[i+2],
+/// a[i+0], a[i+1] and a[i+3], this function will sort the VL and save the
+/// sorted value in 'Sorted' as a[i+0], a[i+1], a[i+2], a[i+3] and saves the
+/// mask for actual memory accesses in program order in 'Mask' as <2,0,1,3>
+bool sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+    ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted,
+    SmallVectorImpl<unsigned> *Mask = nullptr);
+
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass. 
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,

Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Tue Dec 12 19:08:29 2017
@@ -1107,6 +1107,77 @@ static unsigned getAddressSpaceOperand(V
   return -1;
 }
 
+// TODO:This API can be improved by using the permutation of given width as the
+// accesses are entered into the map.
+bool llvm::sortLoadAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                           ScalarEvolution &SE,
+                           SmallVectorImpl<Value *> &Sorted,
+                           SmallVectorImpl<unsigned> *Mask) {
+  SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
+  OffValPairs.reserve(VL.size());
+  Sorted.reserve(VL.size());
+
+  // Walk over the pointers, and map each of them to an offset relative to
+  // first pointer in the array.
+  Value *Ptr0 = getPointerOperand(VL[0]);
+  const SCEV *Scev0 = SE.getSCEV(Ptr0);
+  Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
+  PointerType *PtrTy = dyn_cast<PointerType>(Ptr0->getType());
+  uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
+
+  for (auto *Val : VL) {
+    // The only kind of access we care about here is load.
+    if (!isa<LoadInst>(Val))
+      return false;
+
+    Value *Ptr = getPointerOperand(Val);
+    assert(Ptr && "Expected value to have a pointer operand.");
+    // If a pointer refers to a different underlying object, bail - the
+    // pointers are by definition incomparable.
+    Value *CurrObj = GetUnderlyingObject(Ptr, DL);
+    if (CurrObj != Obj0)
+      return false;
+
+    const SCEVConstant *Diff =
+        dyn_cast<SCEVConstant>(SE.getMinusSCEV(SE.getSCEV(Ptr), Scev0));
+    // The pointers may not have a constant offset from each other, or SCEV
+    // may just not be smart enough to figure out they do. Regardless,
+    // there's nothing we can do.
+    if (!Diff || static_cast<unsigned>(Diff->getAPInt().abs().getSExtValue()) >
+                     (VL.size() - 1) * Size)
+      return false;
+
+    OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
+  }
+  SmallVector<unsigned, 4> UseOrder(VL.size());
+  for (unsigned i = 0; i < VL.size(); i++) {
+    UseOrder[i] = i;
+  }
+
+  // Sort the memory accesses and keep the order of their uses in UseOrder.
+  std::sort(UseOrder.begin(), UseOrder.end(),
+            [&OffValPairs](unsigned Left, unsigned Right) {
+            return OffValPairs[Left].first < OffValPairs[Right].first;
+            });
+
+  for (unsigned i = 0; i < VL.size(); i++)
+    Sorted.emplace_back(OffValPairs[UseOrder[i]].second);
+
+  // Sort UseOrder to compute the Mask.
+  if (Mask) {
+    Mask->reserve(VL.size());
+    for (unsigned i = 0; i < VL.size(); i++)
+      Mask->emplace_back(i);
+    std::sort(Mask->begin(), Mask->end(),
+              [&UseOrder](unsigned Left, unsigned Right) {
+              return UseOrder[Left] < UseOrder[Right];
+              });
+  }
+
+  return true;
+}
+
+
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                                ScalarEvolution &SE, bool CheckType) {

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Tue Dec 12 19:08:29 2017
@@ -646,17 +646,23 @@ private:
   int getEntryCost(TreeEntry *E);
 
   /// This is the recursive part of buildTree.
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
+                     int OpdNum = 0);
 
   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
   bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
 
-  /// Vectorize a single entry in the tree.
-  Value *vectorizeTree(TreeEntry *E);
-
-  /// Vectorize a single entry in the tree, starting in \p VL.
-  Value *vectorizeTree(ArrayRef<Value *> VL);
+  /// Vectorize a single entry in the tree.\p OpdNum indicate the ordinality of
+  /// operand corrsponding to this tree entry \p E for the user tree entry
+  /// indicated by \p UserIndx.
+  //  In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
+  Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
+
+  /// Vectorize a single entry in the tree, starting in \p VL.\p OpdNum indicate
+  /// the ordinality of operand corrsponding to the \p VL of scalar values for the
+  /// user indicated by \p UserIndx this \p VL feeds into.
+  Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
 
   /// \returns the pointer to the vectorized value if \p VL is already
   /// vectorized, or NULL. They may happen in cycles.
@@ -702,6 +708,16 @@ private:
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
+    /// \returns true if the scalars in VL are found in this tree entry.
+    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
+        ScalarEvolution &SE) const {
+      assert(VL.size() == Scalars.size() && "Invalid size");
+      SmallVector<Value *, 8> List;
+      if (!sortLoadAccesses(VL, DL, SE, List))
+        return false;
+      return std::equal(List.begin(), List.end(), Scalars.begin());
+    }
+
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -711,6 +727,14 @@ private:
     /// Do we need to gather this sequence ?
     bool NeedToGather = false;
 
+    /// Records optional shuffle mask for the uses of jumbled memory accesses.
+    /// For example, a non-empty ShuffleMask[1] represents the permutation of
+    /// lanes that operand #1 of this vectorized instruction should undergo
+    /// before feeding this vectorized instruction, whereas an empty
+    /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
+    /// instruction need not be permuted at all.
+    SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
+
     /// Points back to the VectorizableTree.
     ///
     /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
@@ -726,12 +750,31 @@ private:
 
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          int &UserTreeIdx) {
+                          int &UserTreeIdx, const InstructionsState &S,
+                          ArrayRef<unsigned> ShuffleMask = None,
+                          int OpdNum = 0) {
+    assert((!Vectorized || S.Opcode != 0) &&
+           "Vectorized TreeEntry without opcode");
     VectorizableTree.emplace_back(VectorizableTree);
+
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
+
+    TreeEntry *UserTreeEntry = nullptr;
+    if (UserTreeIdx != -1)
+      UserTreeEntry = &VectorizableTree[UserTreeIdx];
+
+    if (UserTreeEntry && !ShuffleMask.empty()) {
+      if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
+        UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
+      assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+             "Mask already present");
+      using mask = SmallVector<unsigned, 4>;
+      mask tempMask(ShuffleMask.begin(), ShuffleMask.end());
+      UserTreeEntry->ShuffleMask[OpdNum] = tempMask;
+    }
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
@@ -1384,34 +1427,34 @@ void BoUpSLP::buildTree(ArrayRef<Value *
 }
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
-                            int UserTreeIdx) {
+                            int UserTreeIdx, int OpdNum) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
   // Don't handle vectors.
   if (S.OpValue->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
 
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
@@ -1423,7 +1466,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     if (EphValues.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is ephemeral.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
@@ -1434,7 +1477,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
       if (E->Scalars[i] != VL[i]) {
         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         return;
       }
     }
@@ -1453,7 +1496,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     if (getTreeEntry(I)) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is already in tree.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
@@ -1463,7 +1506,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
@@ -1477,7 +1520,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
@@ -1486,7 +1529,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     for (unsigned j = i + 1; j < e; ++j)
       if (VL[i] == VL[j]) {
         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         return;
       }
 
@@ -1501,7 +1544,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1520,12 +1563,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           if (Term) {
             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx);
+            newTreeEntry(VL, false, UserTreeIdx, S);
             return;
           }
         }
 
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1535,7 +1578,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
               PH->getIncomingBlock(i)));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
     }
@@ -1547,7 +1590,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       } else {
         BS.cancelScheduling(VL, VL0);
       }
-      newTreeEntry(VL, Reuse, UserTreeIdx);
+      newTreeEntry(VL, Reuse, UserTreeIdx, S);
       return;
     }
     case Instruction::Load: {
@@ -1562,7 +1605,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1573,15 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
       }
 
       // Check if the loads are consecutive, reversed, or neither.
-      // TODO: What we really want is to sort the loads, but for now, check
-      // the two likely directions.
       bool Consecutive = true;
       bool ReverseConsecutive = true;
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1595,7 +1636,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
 
       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true, UserTreeIdx);
+        newTreeEntry(VL, true, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1609,15 +1650,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
             break;
           }
 
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx);
-
       if (ReverseConsecutive) {
-        ++NumLoadsWantToChangeOrder;
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
-      } else {
-        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+        ++NumLoadsWantToChangeOrder;
+        BS.cancelScheduling(VL, VL0);
+        newTreeEntry(VL, false, UserTreeIdx, S);
+        return;
+      }
+
+      if (VL.size() > 2) {
+        bool ShuffledLoads = true;
+        SmallVector<Value *, 8> Sorted;
+        SmallVector<unsigned, 4> Mask;
+        if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
+          auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
+          for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
+            if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
+              ShuffledLoads = false;
+              break;
+            }
+          }
+          // TODO: Tracking how many load wants to have arbitrary shuffled order
+          // would be usefull.
+          if (ShuffledLoads) {
+            DEBUG(dbgs() << "SLP: added a vector of loads which needs "
+                            "permutation of loaded lanes.\n");
+            newTreeEntry(NewVL, true, UserTreeIdx, S,
+                         makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
+            return;
+          }
+        }
       }
+
+      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
     case Instruction::ZExt:
@@ -1637,12 +1704,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1651,7 +1718,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
     }
@@ -1665,13 +1732,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1680,7 +1747,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
     }
@@ -1703,7 +1770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1712,7 +1779,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
         return;
       }
 
@@ -1722,7 +1789,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
 
@@ -1732,7 +1799,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           return;
         }
       }
@@ -1745,7 +1812,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           return;
         }
       }
@@ -1757,12 +1824,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1770,7 +1837,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
     }
@@ -1779,12 +1846,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
@@ -1802,7 +1869,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1816,7 +1883,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                        << "\n");
           return;
@@ -1827,7 +1894,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx);
+            newTreeEntry(VL, false, UserTreeIdx, S);
             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                          << " argument "<< A1I<<"!=" << A1J
                          << "\n");
@@ -1840,14 +1907,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx);
+          newTreeEntry(VL, false, UserTreeIdx, S);
           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
                        << *VL[i] << '\n');
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -1855,7 +1922,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           CallInst *CI2 = dyn_cast<CallInst>(j);
           Operands.push_back(CI2->getArgOperand(i));
         }
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
     }
@@ -1864,11 +1931,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       // then do not vectorize this instruction.
       if (!S.IsAltShuffle) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx);
+        newTreeEntry(VL, false, UserTreeIdx, S);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
-      newTreeEntry(VL, true, UserTreeIdx);
+      newTreeEntry(VL, true, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
@@ -1876,7 +1943,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         ValueList Left, Right;
         reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
         buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
         return;
       }
 
@@ -1886,13 +1953,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
       }
       return;
 
     default:
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -2730,12 +2797,20 @@ Value *BoUpSLP::alreadyVectorized(ArrayR
   return nullptr;
 }
 
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
   InstructionsState S = getSameOpcode(VL);
   if (S.Opcode) {
     if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-      if (E->isSame(VL))
-        return vectorizeTree(E);
+      TreeEntry *UserTreeEntry = nullptr;
+      if (UserIndx != -1)
+        UserTreeEntry = &VectorizableTree[UserIndx];
+
+      if (E->isSame(VL) ||
+          (UserTreeEntry &&
+           (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+           !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
+           E->isFoundJumbled(VL, *DL, *SE)))
+        return vectorizeTree(E, OpdNum, UserIndx);
     }
   }
 
@@ -2747,9 +2822,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
   return Gather(VL, VecTy);
 }
 
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
+  TreeEntry *UserTreeEntry = nullptr;
   if (E->VectorizedValue) {
     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
@@ -2769,6 +2845,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
     return V;
   }
 
+  assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
+         "Expected user tree entry, missing!");
+  int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
+
   unsigned ShuffleOrOp = S.IsAltShuffle ?
            (unsigned) Instruction::ShuffleVector : S.Opcode;
   switch (ShuffleOrOp) {
@@ -2798,7 +2878,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
         Builder.SetInsertPoint(IBB->getTerminator());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
-        Value *Vec = vectorizeTree(Operands);
+        Value *Vec = vectorizeTree(Operands, i, CurrIndx);
         NewPhi->addIncoming(Vec, IBB);
       }
 
@@ -2851,7 +2931,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *InVec = vectorizeTree(INVL);
+      Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2872,8 +2952,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *L = vectorizeTree(LHSV);
-      Value *R = vectorizeTree(RHSV);
+      Value *L = vectorizeTree(LHSV, 0, CurrIndx);
+      Value *R = vectorizeTree(RHSV, 1, CurrIndx);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2900,9 +2980,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *Cond = vectorizeTree(CondVec);
-      Value *True = vectorizeTree(TrueVec);
-      Value *False = vectorizeTree(FalseVec);
+      Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
+      Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
+      Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2943,8 +3023,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL);
-      Value *RHS = vectorizeTree(RHSVL);
+      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -2965,7 +3045,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       // sink them all the way down past store instructions.
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      LoadInst *LI = cast<LoadInst>(VL0);
+      if (UserIndx != -1)
+        UserTreeEntry = &VectorizableTree[UserIndx];
+
+      bool isJumbled = false;
+      LoadInst *LI = NULL;
+      if (UserTreeEntry &&
+          (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
+          !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
+        isJumbled = true;
+        LI = cast<LoadInst>(E->Scalars[0]);
+      } else {
+        LI = cast<LoadInst>(VL0);
+      }
+
       Type *ScalarLoadTy = LI->getType();
       unsigned AS = LI->getPointerAddressSpace();
 
@@ -2987,7 +3080,21 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
       ++NumVectorInstructions;
-      return propagateMetadata(LI, E->Scalars);
+      propagateMetadata(LI, E->Scalars);
+
+      if (isJumbled) {
+        SmallVector<Constant *, 8> Mask;
+        for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
+          Mask.push_back(Builder.getInt32(LaneEntry));
+        // Generate shuffle for jumbled memory access
+        Value *Undef = UndefValue::get(VecTy);
+        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
+                                                  ConstantVector::get(Mask));
+        E->VectorizedValue = Shuf;
+        ++NumVectorInstructions;
+        return Shuf;
+      }
+      return LI;
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(VL0);
@@ -3000,7 +3107,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
 
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *VecValue = vectorizeTree(ScalarStoreValues);
+      Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
       Value *ScalarPtr = SI->getPointerOperand();
       Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
       StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
@@ -3026,7 +3133,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       for (Value *V : E->Scalars)
         Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
 
-      Value *Op0 = vectorizeTree(Op0VL);
+      Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
 
       std::vector<Value *> OpVecs;
       for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
@@ -3035,7 +3142,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
         for (Value *V : E->Scalars)
           OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
 
-        Value *OpVec = vectorizeTree(OpVL);
+        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
         OpVecs.push_back(OpVec);
       }
 
@@ -3074,7 +3181,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
           OpVL.push_back(CEI->getArgOperand(j));
         }
 
-        Value *OpVec = vectorizeTree(OpVL);
+        Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
         DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
         OpVecs.push_back(OpVec);
       }
@@ -3105,8 +3212,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
-      Value *LHS = vectorizeTree(LHSVL);
-      Value *RHS = vectorizeTree(RHSVL);
+      Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
+      Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
 
       if (Value *V = alreadyVectorized(E->Scalars, VL0))
         return V;
@@ -3206,9 +3313,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
       continue;
     TreeEntry *E = getTreeEntry(Scalar);
     assert(E && "Invalid scalar");
-    assert(!E->NeedToGather && "Extracting from a gather list");
+    assert((!E->NeedToGather) && "Extracting from a gather list");
 
-    Value *Vec = E->VectorizedValue;
+    Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue);
+    if (Vec && dyn_cast<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) {
+      Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0);
+    } else {
+      Vec = E->VectorizedValue;
+    }
     assert(Vec && "Can't find vectorizable value");
 
     Value *Lane = Builder.getInt32(ExternalUse.Lane);

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll Tue Dec 12 19:08:29 2017
@@ -11,20 +11,16 @@
     define i32 @fn1() {
 ; CHECK-LABEL: @fn1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 0), align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 1), align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 2), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i32 3), align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP3]], i32 2
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 8, i32 3
-; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
-; CHECK-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 8, i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP7]], <4 x i32> <i32 6, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    ret i32 0
 ;
   entry:

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll?rev=320548&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll Tue Dec 12 19:08:29 2017
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+
+;void jumble (int * restrict A, int * restrict B) {
+  ;  int tmp0 = A[10]*A[0];
+  ;  int tmp1 = A[11]*A[1];
+  ;  int tmp2 = A[12]*A[3];
+  ;  int tmp3 = A[13]*A[2];
+  ;  B[0] = tmp0;
+  ;  B[1] = tmp1;
+  ;  B[2] = tmp2;
+  ;  B[3] = tmp3;
+  ;}
+
+
+  ; Function Attrs: norecurse nounwind uwtable
+  define void @jumble1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @jumble1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load i32, i32* %A, align 4
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
+  %2 = load i32, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+  %3 = load i32, i32* %arrayidx3, align 4
+  %mul4 = mul nsw i32 %2, %3
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
+  %4 = load i32, i32* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
+  %5 = load i32, i32* %arrayidx6, align 4
+  %mul7 = mul nsw i32 %4, %5
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
+  %6 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
+  %7 = load i32, i32* %arrayidx9, align 4
+  %mul10 = mul nsw i32 %6, %7
+  store i32 %mul, i32* %B, align 4
+  %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
+  store i32 %mul4, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
+  store i32 %mul7, i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
+  store i32 %mul10, i32* %arrayidx14, align 4
+  ret void
+  }
+
+;Reversing the operand of MUL
+  ; Function Attrs: norecurse nounwind uwtable
+  define void @jumble2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @jumble2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 10
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 11
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 10
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load i32, i32* %A, align 4
+  %mul = mul nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 11
+  %2 = load i32, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 1
+  %3 = load i32, i32* %arrayidx3, align 4
+  %mul4 = mul nsw i32 %3, %2
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 12
+  %4 = load i32, i32* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i64 3
+  %5 = load i32, i32* %arrayidx6, align 4
+  %mul7 = mul nsw i32 %5, %4
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 13
+  %6 = load i32, i32* %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %A, i64 2
+  %7 = load i32, i32* %arrayidx9, align 4
+  %mul10 = mul nsw i32 %7, %6
+  store i32 %mul, i32* %B, align 4
+  %arrayidx12 = getelementptr inbounds i32, i32* %B, i64 1
+  store i32 %mul4, i32* %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 2
+  store i32 %mul7, i32* %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, i32* %B, i64 3
+  store i32 %mul10, i32* %arrayidx14, align 4
+  ret void
+  }
+

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll?rev=320548&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-used-in-phi.ll Tue Dec 12 19:08:29 2017
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+
+;void phiUsingLoads(int *restrict A, int *restrict B) {
+;  int tmp0, tmp1, tmp2, tmp3;
+;  for (int i = 0; i < 100; i++) {
+;    if (A[0] == 0) {
+;      tmp0 = A[i + 0];
+;      tmp1 = A[i + 1];
+;      tmp2 = A[i + 2];
+;      tmp3 = A[i + 3];
+;    } else if (A[25] == 0) {
+;      tmp0 = A[i + 0];
+;      tmp1 = A[i + 1];
+;      tmp2 = A[i + 2];
+;      tmp3 = A[i + 3];
+;    } else if (A[50] == 0) {
+;      tmp0 = A[i + 0];
+;      tmp1 = A[i + 1];
+;      tmp2 = A[i + 2];
+;      tmp3 = A[i + 3];
+;    } else if (A[75] == 0) {
+;      tmp0 = A[i + 0];
+;      tmp1 = A[i + 1];
+;      tmp2 = A[i + 3];
+;      tmp3 = A[i + 2];
+;    }
+;  }
+;  B[0] = tmp0;
+;  B[1] = tmp1;
+;  B[2] = tmp2;
+;  B[3] = tmp3;
+;}
+
+
+; Function Attrs: norecurse nounwind uwtable
+define void @phiUsingLoads(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) local_unnamed_addr #0 {
+; CHECK-LABEL: @phiUsingLoads(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 25
+; CHECK-NEXT:    [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 50
+; CHECK-NEXT:    [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 75
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX65:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX66:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP27:%.*]], <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <4 x i32> [ undef, [[ENTRY]] ], [ [[TMP27]], [[FOR_INC]] ]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX12]], align 4
+; CHECK-NEXT:    [[CMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[CMP13]], label [[IF_THEN14:%.*]], label [[IF_ELSE27:%.*]]
+; CHECK:       if.then14:
+; CHECK-NEXT:    [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX26:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX17]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP13:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else27:
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX28]], align 4
+; CHECK-NEXT:    [[CMP29:%.*]] = icmp eq i32 [[TMP14]], 0
+; CHECK-NEXT:    br i1 [[CMP29]], label [[IF_THEN30:%.*]], label [[IF_ELSE43:%.*]]
+; CHECK:       if.then30:
+; CHECK-NEXT:    [[ARRAYIDX33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX39:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX42:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX33]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       if.else43:
+; CHECK-NEXT:    [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX44]], align 4
+; CHECK-NEXT:    [[CMP45:%.*]] = icmp eq i32 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[CMP45]], label [[IF_THEN46:%.*]], label [[FOR_INC]]
+; CHECK:       if.then46:
+; CHECK-NEXT:    [[ARRAYIDX49:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; CHECK-NEXT:    [[ARRAYIDX55:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[ARRAYIDX58:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP23]]
+; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX49]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP25:%.*]] = load <4 x i32>, <4 x i32>* [[TMP24]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[TMP27]] = phi <4 x i32> [ [[TMP7]], [[IF_THEN]] ], [ [[TMP13]], [[IF_THEN14]] ], [ [[TMP19]], [[IF_THEN30]] ], [ [[TMP26]], [[IF_THEN46]] ], [ [[TMP2]], [[IF_ELSE43]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+entry:
+  %0 = load i32, i32* %A, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  %arrayidx12 = getelementptr inbounds i32, i32* %A, i64 25
+  %arrayidx28 = getelementptr inbounds i32, i32* %A, i64 50
+  %arrayidx44 = getelementptr inbounds i32, i32* %A, i64 75
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc
+  store i32 %tmp0.1, i32* %B, align 4
+  %arrayidx64 = getelementptr inbounds i32, i32* %B, i64 1
+  store i32 %tmp1.1, i32* %arrayidx64, align 4
+  %arrayidx65 = getelementptr inbounds i32, i32* %B, i64 2
+  store i32 %tmp2.1, i32* %arrayidx65, align 4
+  %arrayidx66 = getelementptr inbounds i32, i32* %B, i64 3
+  store i32 %tmp3.1, i32* %arrayidx66, align 4
+  ret void
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %tmp3.0111 = phi i32 [ undef, %entry ], [ %tmp3.1, %for.inc ]
+  %tmp2.0110 = phi i32 [ undef, %entry ], [ %tmp2.1, %for.inc ]
+  %tmp1.0109 = phi i32 [ undef, %entry ], [ %tmp1.1, %for.inc ]
+  %tmp0.0108 = phi i32 [ undef, %entry ], [ %tmp0.1, %for.inc ]
+  br i1 %cmp1, label %if.then, label %if.else
+
+if.then:                                          ; preds = %for.body
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %2 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %2
+  %3 = load i32, i32* %arrayidx5, align 4
+  %4 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx8 = getelementptr inbounds i32, i32* %A, i64 %4
+  %5 = load i32, i32* %arrayidx8, align 4
+  %6 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx11 = getelementptr inbounds i32, i32* %A, i64 %6
+  %7 = load i32, i32* %arrayidx11, align 4
+  br label %for.inc
+
+if.else:                                          ; preds = %for.body
+  %8 = load i32, i32* %arrayidx12, align 4
+  %cmp13 = icmp eq i32 %8, 0
+  br i1 %cmp13, label %if.then14, label %if.else27
+
+if.then14:                                        ; preds = %if.else
+  %arrayidx17 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %9 = load i32, i32* %arrayidx17, align 4
+  %10 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx20 = getelementptr inbounds i32, i32* %A, i64 %10
+  %11 = load i32, i32* %arrayidx20, align 4
+  %12 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx23 = getelementptr inbounds i32, i32* %A, i64 %12
+  %13 = load i32, i32* %arrayidx23, align 4
+  %14 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx26 = getelementptr inbounds i32, i32* %A, i64 %14
+  %15 = load i32, i32* %arrayidx26, align 4
+  br label %for.inc
+
+if.else27:                                        ; preds = %if.else
+  %16 = load i32, i32* %arrayidx28, align 4
+  %cmp29 = icmp eq i32 %16, 0
+  br i1 %cmp29, label %if.then30, label %if.else43
+
+if.then30:                                        ; preds = %if.else27
+  %arrayidx33 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %17 = load i32, i32* %arrayidx33, align 4
+  %18 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx36 = getelementptr inbounds i32, i32* %A, i64 %18
+  %19 = load i32, i32* %arrayidx36, align 4
+  %20 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx39 = getelementptr inbounds i32, i32* %A, i64 %20
+  %21 = load i32, i32* %arrayidx39, align 4
+  %22 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx42 = getelementptr inbounds i32, i32* %A, i64 %22
+  %23 = load i32, i32* %arrayidx42, align 4
+  br label %for.inc
+
+if.else43:                                        ; preds = %if.else27
+  %24 = load i32, i32* %arrayidx44, align 4
+  %cmp45 = icmp eq i32 %24, 0
+  br i1 %cmp45, label %if.then46, label %for.inc
+
+if.then46:                                        ; preds = %if.else43
+  %arrayidx49 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %25 = load i32, i32* %arrayidx49, align 4
+  %26 = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx52 = getelementptr inbounds i32, i32* %A, i64 %26
+  %27 = load i32, i32* %arrayidx52, align 4
+  %28 = add nuw nsw i64 %indvars.iv, 3
+  %arrayidx55 = getelementptr inbounds i32, i32* %A, i64 %28
+  %29 = load i32, i32* %arrayidx55, align 4
+  %30 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx58 = getelementptr inbounds i32, i32* %A, i64 %30
+  %31 = load i32, i32* %arrayidx58, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.then, %if.then30, %if.else43, %if.then46, %if.then14
+  %tmp0.1 = phi i32 [ %1, %if.then ], [ %9, %if.then14 ], [ %17, %if.then30 ], [ %25, %if.then46 ], [ %tmp0.0108, %if.else43 ]
+  %tmp1.1 = phi i32 [ %3, %if.then ], [ %11, %if.then14 ], [ %19, %if.then30 ], [ %27, %if.then46 ], [ %tmp1.0109, %if.else43 ]
+  %tmp2.1 = phi i32 [ %5, %if.then ], [ %13, %if.then14 ], [ %21, %if.then30 ], [ %29, %if.then46 ], [ %tmp2.0110, %if.else43 ]
+  %tmp3.1 = phi i32 [ %7, %if.then ], [ %15, %if.then14 ], [ %23, %if.then30 ], [ %31, %if.then46 ], [ %tmp3.0111, %if.else43 ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll Tue Dec 12 19:08:29 2017
@@ -5,34 +5,27 @@
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
-; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
+; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
-; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
-; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
-; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
-; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
-; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
+; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
+; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll?rev=320548&r1=320547&r2=320548&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll Tue Dec 12 19:08:29 2017
@@ -6,33 +6,26 @@
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0