[llvm] r293386 - [SLP] Vectorize loads of consecutive memory accesses, accessed in non-consecutive (jumbled) way.

Mohammad Shahid via llvm-commits llvm-commits at lists.llvm.org
Sat Jan 28 09:59:45 PST 2017


Author: ashahid
Date: Sat Jan 28 11:59:44 2017
New Revision: 293386

URL: http://llvm.org/viewvc/llvm-project?rev=293386&view=rev
Log:
[SLP] Vectorize loads of consecutive memory accesses, accessed in non-consecutive (jumbled) way.
Jumbled scalar loads are sorted while building the tree, and such tree entries are marked so that the vectorized load is followed by a shufflevector with the proper mask.

Reviewers: hfinkel, mssimpso, mkuper

Differential Revision: https://reviews.llvm.org/D26905

Change-Id: I9c0c8e6f91a00076a7ee1465440a3f6ae092f7ad
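
To illustrate the intent, here is a hand-written sketch (not taken from the patch or its tests; the names are invented) of four i32 loads from consecutive addresses whose results are used in the order in[1], in[3], in[0], in[2]:

  ; Before: scalar loads, consecutive in memory but jumbled in use order.
  %g1 = getelementptr inbounds i32, i32* %in, i64 1
  %g2 = getelementptr inbounds i32, i32* %in, i64 2
  %g3 = getelementptr inbounds i32, i32* %in, i64 3
  %l1 = load i32, i32* %g1, align 4
  %l3 = load i32, i32* %g3, align 4
  %l0 = load i32, i32* %in, align 4
  %l2 = load i32, i32* %g2, align 4

  ; After (when the cost model agrees): one wide load in memory order plus a
  ; shuffle that restores the jumbled use order.
  %p = bitcast i32* %in to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %p, align 4
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>

Lane i of %s is the scalar that the i-th user expected, so the users can be vectorized without any reordering.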

Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Sat Jan 28 11:59:44 2017
@@ -690,6 +690,11 @@ int64_t getPtrStride(PredicatedScalarEvo
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
+/// \brief Sorts the jumbled memory accesses in \p VL and saves the result in
+/// the vector argument 'Sorted'.
+void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                     ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
+
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass. 
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,

Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Sat Jan 28 11:59:44 2017
@@ -1058,6 +1058,37 @@ static unsigned getAddressSpaceOperand(V
   return -1;
 }
 
+/// Sorts the memory accesses and saves the result in 'Sorted'.
+void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
+                         ScalarEvolution &SE,
+                         SmallVectorImpl<Value *> &Sorted) {
+  SmallVector<std::pair<int, Value *>, 4> OffValPairs;
+  for (auto *Val : VL) {
+    // Compute the constant offset of each memory access from the base pointer
+    // and insert it into the vector of (offset, value) pairs to be sorted.
+    Value *Ptr = getPointerOperand(Val);
+    unsigned AS = getAddressSpaceOperand(Val);
+    unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
+    Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+    APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
+
+    // FIXME: Currently the offsets are assumed to be constant. However, this
+    // is not always true, as offsets can also be variables, in which case we
+    // would need to consider the difference of the variable offsets.
+    APInt Offset(PtrBitWidth, 0);
+    Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+    OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
+  }
+  std::sort(OffValPairs.begin(), OffValPairs.end(),
+            [](const std::pair<int, Value *> &Left,
+               const std::pair<int, Value *> &Right) {
+              return Left.first < Right.first;
+            });
+
+  for (auto& it : OffValPairs)
+    Sorted.push_back(it.second);
+}
+
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                                ScalarEvolution &SE, bool CheckType) {
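
As a rough illustration of what the new helper computes (hypothetical IR, not from the patch): each pointer operand is stripped to its constant byte offset from the common base, and the values are handed back in ascending offset order.

  %g1 = getelementptr inbounds i32, i32* %in, i64 1   ; byte offset 4
  %g2 = getelementptr inbounds i32, i32* %in, i64 2   ; byte offset 8
  %g3 = getelementptr inbounds i32, i32* %in, i64 3   ; byte offset 12
  %l2 = load i32, i32* %g2, align 4
  %l0 = load i32, i32* %in, align 4                   ; byte offset 0
  %l3 = load i32, i32* %g3, align 4
  %l1 = load i32, i32* %g1, align 4
  ; For VL = (%l2, %l0, %l3, %l1) the accumulated offsets are (8, 0, 12, 4),
  ; so sortMemAccesses() fills 'Sorted' with (%l0, %l1, %l2, %l3).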

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Sat Jan 28 11:59:44 2017
@@ -410,8 +410,10 @@ private:
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
   bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
 
-  /// Vectorize a single entry in the tree.
-  Value *vectorizeTree(TreeEntry *E);
+  /// Vectorize a single entry in the tree. \p VL contains all isomorphic
+  /// scalars in the order of their use in the user program, for example ADD1,
+  /// ADD2, and so on, or LOAD1, LOAD2, etc.
+  Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
 
   /// Vectorize a single entry in the tree, starting in \p VL.
   Value *vectorizeTree(ArrayRef<Value *> VL);
@@ -452,7 +454,7 @@ private:
                                       SmallVectorImpl<Value *> &Right);
   struct TreeEntry {
     TreeEntry() : Scalars(), VectorizedValue(nullptr),
-    NeedToGather(0) {}
+    NeedToGather(0), NeedToShuffle(0) {}
 
     /// \returns true if the scalars in VL are equal to this entry.
     bool isSame(ArrayRef<Value *> VL) const {
@@ -460,6 +462,15 @@ private:
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
+    /// \returns true if the scalars in VL are found in this tree entry.
+    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
+                        ScalarEvolution &SE) const {
+      assert(VL.size() == Scalars.size() && "Invalid size");
+      SmallVector<Value *, 8> List;
+      sortMemAccesses(VL, DL, SE, List);
+      return std::equal(List.begin(), List.end(), Scalars.begin());
+    }
+
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -468,15 +479,20 @@ private:
 
     /// Do we need to gather this sequence ?
     bool NeedToGather;
+
+    /// Do we need to shuffle the load ?
+    bool NeedToShuffle;
   };
 
   /// Create a new VectorizableTree entry.
-  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
+                          bool NeedToShuffle) {
     VectorizableTree.emplace_back();
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
+    Last->NeedToShuffle = NeedToShuffle;
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
@@ -983,21 +999,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
 
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
   // Don't handle vectors.
   if (VL[0]->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       return;
     }
   unsigned Opcode = getSameOpcode(VL);
@@ -1014,7 +1030,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
@@ -1026,7 +1042,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     if (EphValues.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is ephemeral.\n");
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       return;
     }
   }
@@ -1039,7 +1055,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
       if (E->Scalars[i] != VL[i]) {
         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-        newTreeEntry(VL, false);
+        newTreeEntry(VL, false, false);
         return;
       }
     }
@@ -1052,7 +1068,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     if (ScalarToTreeEntry.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is already in tree.\n");
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       return;
     }
   }
@@ -1062,7 +1078,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       return;
     }
   }
@@ -1076,7 +1092,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
 
@@ -1085,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     for (unsigned j = i+1; j < e; ++j)
       if (VL[i] == VL[j]) {
         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, false);
+        newTreeEntry(VL, false, false);
         return;
       }
 
@@ -1100,7 +1116,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     assert((!BS.getScheduleData(VL[0]) ||
             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false);
+    newTreeEntry(VL, false, false);
     return;
   }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1117,12 +1133,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           if (Term) {
             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
             BS.cancelScheduling(VL);
-            newTreeEntry(VL, false);
+            newTreeEntry(VL, false, false);
             return;
           }
         }
 
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1144,7 +1160,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       } else {
         BS.cancelScheduling(VL);
       }
-      newTreeEntry(VL, Reuse);
+      newTreeEntry(VL, Reuse, false);
       return;
     }
     case Instruction::Load: {
@@ -1160,7 +1176,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false);
+        newTreeEntry(VL, false, false);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1171,15 +1187,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
       }
 
       // Check if the loads are consecutive, reversed, or neither.
-      // TODO: What we really want is to sort the loads, but for now, check
-      // the two likely directions.
       bool Consecutive = true;
       bool ReverseConsecutive = true;
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1193,7 +1207,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
 
       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true);
+        newTreeEntry(VL, true, false);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1207,8 +1221,25 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
             break;
           }
 
+      if (VL.size() > 2 && !ReverseConsecutive) {
+        bool ShuffledLoads = true;
+        SmallVector<Value *, 8> List;
+        sortMemAccesses(VL, *DL, *SE, List);
+        auto NewVL = makeArrayRef(List.begin(), List.end());
+        for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
+          if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
+            ShuffledLoads = false;
+            break;
+          }
+        }
+        if (ShuffledLoads) {
+          newTreeEntry(NewVL, true, true);
+          return;
+        }
+      }
+
       BS.cancelScheduling(VL);
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
 
       if (ReverseConsecutive) {
         ++NumLoadsWantToChangeOrder;
@@ -1231,16 +1262,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcTy = VL0->getOperand(0)->getType();
-      for (unsigned i = 0; i < VL.size(); ++i) {
-        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+      for (Value *Val : VL) {
+        Type *Ty = cast<Instruction>(Val)->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1263,13 +1294,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1301,7 +1332,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1326,11 +1357,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     }
     case Instruction::GetElementPtr: {
       // We don't combine GEPs with complicated (nested) indexing.
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
+      for (Value *Val : VL) {
+        if (cast<Instruction>(Val)->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           return;
         }
       }
@@ -1338,29 +1369,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       // We can't combine several GEPs into one vector if they operate on
       // different types.
       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
+      for (Value *Val : VL) {
+        Type *CurTy = cast<Instruction>(Val)->getOperand(0)->getType();
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           return;
         }
       }
 
       // We don't combine GEPs with non-constant indexes.
-      for (unsigned j = 0; j < VL.size(); ++j) {
-        auto Op = cast<Instruction>(VL[j])->getOperand(1);
+      for (Value *Val : VL) {
+        auto Op = cast<Instruction>(Val)->getOperand(1);
         if (!isa<ConstantInt>(Op)) {
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           return;
         }
       }
 
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1377,12 +1408,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
@@ -1400,7 +1431,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false);
+        newTreeEntry(VL, false, false);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1414,7 +1445,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                        << "\n");
           return;
@@ -1425,7 +1456,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
             BS.cancelScheduling(VL);
-            newTreeEntry(VL, false);
+            newTreeEntry(VL, false, false);
             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                          << " argument "<< A1I<<"!=" << A1J
                          << "\n");
@@ -1438,14 +1469,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false);
+          newTreeEntry(VL, false, false);
           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
                        << *VL[i] << '\n');
           return;
         }
       }
 
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -1462,11 +1493,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       // then do not vectorize this instruction.
       if (!isAltShuffle) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false);
+        newTreeEntry(VL, false, false);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
-      newTreeEntry(VL, true);
+      newTreeEntry(VL, true, false);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
@@ -1490,7 +1521,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
     }
     default:
       BS.cancelScheduling(VL);
-      newTreeEntry(VL, false);
+      newTreeEntry(VL, false, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -1723,6 +1754,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
             TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
                                            VecTy, alignment, 0);
+      if (E->NeedToShuffle) {
+        VecLdCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
+      }
       return VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
@@ -2285,8 +2320,8 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
   if (ScalarToTreeEntry.count(VL[0])) {
     int Idx = ScalarToTreeEntry[VL[0]];
     TreeEntry *E = &VectorizableTree[Idx];
-    if (E->isSame(VL))
-      return vectorizeTree(E);
+    if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE)))
+      return vectorizeTree(VL, E);
   }
 
   Type *ScalarTy = VL[0]->getType();
@@ -2297,10 +2332,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
   return Gather(VL, VecTy);
 }
 
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
-  if (E->VectorizedValue) {
+  if (E->VectorizedValue && !E->NeedToShuffle) {
     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
   }
@@ -2534,7 +2569,35 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
       ++NumVectorInstructions;
-      return propagateMetadata(LI, E->Scalars);
+      propagateMetadata(LI, E->Scalars);
+
+      // As the program order of the scalar loads is jumbled, the vectorized
+      // 'load' must be followed by a 'shuffle' with the required mask.
+      if (!VL.empty() && (E->NeedToShuffle)) {
+        assert(VL.size() == E->Scalars.size() &&
+               "Equal number of scalars expected");
+        SmallVector<Constant *, 8> Mask;
+        for (Value *Val : VL) {
+          if (ScalarToTreeEntry.count(Val)) {
+            int Idx = ScalarToTreeEntry[Val];
+            TreeEntry *E = &VectorizableTree[Idx];
+            for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+              if (E->Scalars[Lane] == Val) {
+                Mask.push_back(Builder.getInt32(Lane));
+                break;
+              }
+            }
+          }
+        }
+
+        // Generate shuffle for jumbled memory access
+        Value *Undef = UndefValue::get(VecTy);
+        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
+                                                  ConstantVector::get(Mask));
+        return Shuf;
+      }
+
+      return LI;
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(VL0);
@@ -2709,7 +2772,7 @@ Value *BoUpSLP::vectorizeTree() {
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+  auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
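
For reference, the mask built in the load case above maps the requested (jumbled) order back onto the lanes of the sorted wide load; a hypothetical example (not from the patch):

  ; Tree entry Scalars, sorted into memory order: (%l0, %l1, %l2, %l3)
  ; VL in the order the users request them:       (%l1, %l3, %l0, %l2)
  ; Mask[i] = lane of VL[i] in Scalars, giving <1, 3, 0, 2>, so lane i of the
  ; shuffle result is exactly VL[i].
  %shuffled = shufflevector <4 x i32> %wide.load, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>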

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load.ll Sat Jan 28 11:59:44 2017
@@ -1,38 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
 
 
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/reduction_loads.ll Sat Jan 28 11:59:44 2017
@@ -31,8 +31,8 @@ define i32 @test(i32* nocapture readonly
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]]
 ; CHECK-NEXT:    br i1 true, label %for.end, label %for.body
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret i32 [[ADD_7]]

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll?rev=293386&r1=293385&r2=293386&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/store-jumbled.ll Sat Jan 28 11:59:44 2017
@@ -1,38 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
 
 
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
-; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
-; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
-; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
-; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
-; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
-; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
-; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
-; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
-; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0



