[llvm] r257918 - Reapply r257800 with fix

Matthew Simpson via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 15 10:51:51 PST 2016


Author: mssimpso
Date: Fri Jan 15 12:51:51 2016
New Revision: 257918

URL: http://llvm.org/viewvc/llvm-project?rev=257918&view=rev
Log:
Reapply r257800 with fix

The fix uniques the bundle of getelementptr indices we are about to vectorize
since it's possible for the same index to be used by multiple instructions.
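For illustration only (hypothetical IR; %base, %n, %x, %y, and %g are made-up
names, not taken from the reported failure), two getelementptrs off the same
underlying object can share a single index value:

  %p   = getelementptr inbounds i16, i16* %base, i64 %n
  %idx = sub nsw i64 %x, %y
  %g1  = getelementptr inbounds i16, i16* %base, i64 %idx
  %g2  = getelementptr inbounds i16, i16* %p, i64 %idx

Because %g1 and %g2 differ by a non-constant amount, the SCEV-based filtering
in the patch below keeps both as candidates, so without the uniquing step the
seeded bundle would contain %idx twice.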
The original commit message is below.

[SLP] Vectorize the index computations of getelementptr instructions.

This patch seeds the SLP vectorizer with getelementptr indices. The primary
motivation in doing so is to vectorize gather-like idioms beginning with
consecutive loads (e.g., g[a[0] - b[0]] + g[a[1] - b[1]] + ...). While these
cases could be vectorized with a top-down phase, seeding the existing bottom-up
phase with the index computations avoids the complexity, compile-time, and
phase ordering issues associated with a full top-down pass. Only bundles of
single-index getelementptrs with non-constant differences are considered for
vectorization.
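To make the seeding concrete, here is a simplified sketch of the index
computations for the idiom above (hypothetical IR; the value names are made
up). The seeds are the getelementptrs into g, and the bundle handed to the
bottom-up phase is their indices {%d0, %d1}:

  %a1.ptr = getelementptr inbounds i16, i16* %a, i64 1
  %b1.ptr = getelementptr inbounds i16, i16* %b, i64 1
  %a0 = load i16, i16* %a              ; a[0]
  %a1 = load i16, i16* %a1.ptr         ; a[1], consecutive with a[0]
  %b0 = load i16, i16* %b              ; b[0]
  %b1 = load i16, i16* %b1.ptr         ; b[1], consecutive with b[0]
  %x0 = zext i16 %a0 to i32
  %x1 = zext i16 %a1 to i32
  %y0 = zext i16 %b0 to i32
  %y1 = zext i16 %b1 to i32
  %d0 = sub nsw i32 %x0, %y0           ; a[0] - b[0]
  %d1 = sub nsw i32 %x1, %y1           ; a[1] - b[1]
  %g0 = getelementptr inbounds i16, i16* %g, i32 %d0
  %g1 = getelementptr inbounds i16, i16* %g, i32 %d1

Starting from {%d0, %d1}, the existing bottom-up phase can reach the
consecutive loads of a and b; the gather-reduce.ll test added below exercises
an 8-wide version of this pattern.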

Added:
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=257918&r1=257917&r2=257918&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Jan 15 12:51:51 2016
@@ -412,6 +412,13 @@ public:
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }
 
+  /// \return The vector element size in bits to use when vectorizing the
+  /// expression tree ending at \p V. If V is a store, the size is the width of
+  /// the stored value. Otherwise, the size is the width of the largest loaded
+  /// value reaching V. This method is used by the vectorizer to calculate
+  /// vectorization factors.
+  unsigned getVectorElementSize(Value *V);
+
 private:
   struct TreeEntry;
 
@@ -3139,10 +3146,73 @@ void BoUpSLP::scheduleBlock(BlockSchedul
   BS->ScheduleStart = nullptr;
 }
 
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If V is a store, just return the width of the stored value without
+  // traversing the expression tree. This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
+
+  // If V is not a store, we can traverse the expression tree to find loads
+  // that feed it. The type of the loaded value may indicate a more suitable
+  // width than V's type. We want to base the vector element size on the width
+  // of memory operations where possible.
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<Instruction *, 16> Visited;
+  if (auto *I = dyn_cast<Instruction>(V))
+    Worklist.push_back(I);
+
+  // Traverse the expression tree in bottom-up order looking for loads. If we
+  // encounter an instruction we don't yet handle, we give up.
+  auto MaxWidth = 0u;
+  auto FoundUnknownInst = false;
+  while (!Worklist.empty() && !FoundUnknownInst) {
+    auto *I = Worklist.pop_back_val();
+    Visited.insert(I);
+
+    // We should only be looking at scalar instructions here. If the current
+    // instruction has a vector type, give up.
+    auto *Ty = I->getType();
+    if (isa<VectorType>(Ty))
+      FoundUnknownInst = true;
+
+    // If the current instruction is a load, update MaxWidth to reflect the
+    // width of the loaded value.
+    else if (isa<LoadInst>(I))
+      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
+
+    // Otherwise, we need to visit the operands of the instruction. We only
+    // handle the interesting cases from buildTree here. If an operand is an
+    // instruction we haven't yet visited, we add it to the worklist.
+    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+      for (Use &U : I->operands())
+        if (auto *J = dyn_cast<Instruction>(U.get()))
+          if (!Visited.count(J))
+            Worklist.push_back(J);
+    }
+
+    // If we don't yet handle the instruction, give up.
+    else
+      FoundUnknownInst = true;
+  }
+
+  // If we didn't encounter a memory access in the expression tree, or if we
+  // gave up for some reason, just return the width of V.
+  if (!MaxWidth || FoundUnknownInst)
+    return DL.getTypeSizeInBits(V->getType());
+
+  // Otherwise, return the maximum width we found.
+  return MaxWidth;
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
   typedef MapVector<Value *, StoreList> StoreListMap;
+  typedef SmallVector<WeakVH, 8> WeakVHList;
+  typedef MapVector<Value *, WeakVHList> WeakVHListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -3172,7 +3242,8 @@ struct SLPVectorizer : public FunctionPa
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
-    StoreRefs.clear();
+    Stores.clear();
+    GEPs.clear();
     bool Changed = false;
 
     // If the target claims to have no vector registers don't attempt
@@ -3206,15 +3277,24 @@ struct SLPVectorizer : public FunctionPa
 
     // Scan the blocks in the function in post order.
     for (auto BB : post_order(&F.getEntryBlock())) {
+      collectSeedInstructions(BB);
+
       // Vectorize trees that end at stores.
-      if (unsigned count = collectStores(BB, R)) {
-        (void)count;
-        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+      if (NumStores > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
         Changed |= vectorizeStoreChains(R);
       }
 
       // Vectorize trees that end at reductions.
       Changed |= vectorizeChainsInBlock(BB, R);
+
+      // Vectorize the index computations of getelementptr instructions. This
+      // is primarily intended to catch gather-like idioms ending at
+      // non-consecutive loads.
+      if (NumGEPs > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
+        Changed |= vectorizeGEPIndices(BB, R);
+      }
     }
 
     if (Changed) {
@@ -3241,12 +3321,14 @@ struct SLPVectorizer : public FunctionPa
   }
 
 private:
-
-  /// \brief Collect memory references and sort them according to their base
-  /// object. We sort the stores to their base objects to reduce the cost of the
-  /// quadratic search on the stores. TODO: We can further reduce this cost
-  /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+  /// \brief Collect store and getelementptr instructions and organize them
+  /// according to the underlying object of their pointer operands. We sort the
+  /// instructions by their underlying objects to reduce the cost of
+  /// consecutive access queries.
+  ///
+  /// TODO: We can further reduce this cost if we flush the chain creation
+  ///       every time we run into a memory barrier.
+  void collectSeedInstructions(BasicBlock *BB);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
@@ -3262,9 +3344,13 @@ private:
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
 
-  /// \brief Vectorize the stores that were collected in StoreRefs.
+  /// \brief Vectorize the store instructions collected in Stores.
   bool vectorizeStoreChains(BoUpSLP &R);
 
+  /// \brief Vectorize the index computations of the getelementptr instructions
+  /// collected in GEPs.
+  bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
+
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
@@ -3274,8 +3360,19 @@ private:
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                        BoUpSLP &R);
-private:
-  StoreListMap StoreRefs;
+
+  /// The store instructions in a basic block organized by base pointer.
+  StoreListMap Stores;
+
+  /// The getelementptr instructions in a basic block organized by base pointer.
+  WeakVHListMap GEPs;
+
+  /// The number of store instructions in a basic block.
+  unsigned NumStores;
+
+  /// The number of getelementptr instructions in a basic block.
+  unsigned NumGEPs;
+
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
 
@@ -3296,9 +3393,7 @@ bool SLPVectorizer::vectorizeStoreChain(
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
         << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
-  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+  unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3409,33 +3504,43 @@ bool SLPVectorizer::vectorizeStores(Arra
   return Changed;
 }
 
+void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
 
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
-  unsigned count = 0;
-  StoreRefs.clear();
+  // Initialize the collections. We will make a single pass over the block.
+  Stores.clear();
+  GEPs.clear();
+  NumStores = NumGEPs = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
-  for (Instruction &I : *BB) {
-    StoreInst *SI = dyn_cast<StoreInst>(&I);
-    if (!SI)
-      continue;
 
-    // Don't touch volatile stores.
-    if (!SI->isSimple())
-      continue;
-
-    // Check that the pointer points to scalars.
-    Type *Ty = SI->getValueOperand()->getType();
-    if (!isValidElementType(Ty))
-      continue;
+  // Visit the store and getelementptr instructions in BB and organize them in
+  // Stores and GEPs according to the underlying objects of their pointer
+  // operands.
+  for (Instruction &I : *BB) {
 
-    // Find the base pointer.
-    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
+    // Ignore store instructions that are volatile or have a pointer operand
+    // that doesn't point to a scalar type.
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      if (!SI->isSimple())
+        continue;
+      if (!isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+      Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
+      ++NumStores;
+    }
 
-    // Save the store locations.
-    StoreRefs[Ptr].push_back(SI);
-    count++;
+    // Ignore getelementptr instructions that have more than one index, a
+    // constant index, or a pointer operand that doesn't point to a scalar
+    // type.
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+      auto Idx = GEP->idx_begin()->get();
+      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+        continue;
+      if (!isValidElementType(Idx->getType()))
+        continue;
+      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
+      ++NumGEPs;
+    }
   }
-  return count;
 }
 
 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
@@ -3459,12 +3564,10 @@ bool SLPVectorizer::tryToVectorizeList(A
     return false;
 
   unsigned Opcode0 = I0->getOpcode();
-  const DataLayout &DL = I0->getModule()->getDataLayout();
 
-  Type *Ty0 = I0->getType();
-  unsigned Sz = DL.getTypeSizeInBits(Ty0);
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
+  unsigned Sz = R.getVectorElementSize(I0);
   unsigned VF = MinVecRegSize / Sz;
 
   for (Value *V : VL) {
@@ -4183,11 +4286,94 @@ bool SLPVectorizer::vectorizeChainsInBlo
   return Changed;
 }
 
+bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+  auto Changed = false;
+  for (auto &Entry : GEPs) {
+
+    // If the getelementptr list has fewer than two elements, there's nothing
+    // to do.
+    if (Entry.second.size() < 2)
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+                 << Entry.second.size() << ".\n");
+
+    // We process the getelementptr list in chunks of 16 (like we do for
+    // stores) to minimize compile-time.
+    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
+      auto Len = std::min<unsigned>(BE - BI, 16);
+      auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+
+      // Initialize a set of candidate getelementptrs. Note that we use a
+      // SetVector here to preserve program order. If the index computations
+      // are vectorizable and begin with loads, we want to minimize the chance
+      // of having to reorder them later.
+      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+      // Some of the candidates may have already been vectorized after we
+      // initially collected them. If so, the WeakVHs will have nullified the
+      // values, so remove them from the set of candidates.
+      Candidates.remove(nullptr);
+
+      // Remove from the set of candidates all pairs of getelementptrs with
+      // constant differences. Such getelementptrs are likely not good
+      // candidates for vectorization in a bottom-up phase since one can be
+      // computed from the other. We also ensure all candidate getelementptr
+      // indices are unique.
+      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
+        if (!Candidates.count(GEPI))
+          continue;
+        auto *SCEVI = SE->getSCEV(GEPList[I]);
+        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
+          auto *SCEVJ = SE->getSCEV(GEPList[J]);
+          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+            Candidates.remove(GEPList[I]);
+            Candidates.remove(GEPList[J]);
+          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+            Candidates.remove(GEPList[J]);
+          }
+        }
+      }
+
+      // We break out of the above computation as soon as we know there are
+      // fewer than two candidates remaining.
+      if (Candidates.size() < 2)
+        continue;
+
+      // Add the single, non-constant index of each candidate to the bundle. We
+      // ensured the indices met these constraints when we originally collected
+      // the getelementptrs.
+      SmallVector<Value *, 16> Bundle(Candidates.size());
+      auto BundleIndex = 0u;
+      for (auto *V : Candidates) {
+        auto *GEP = cast<GetElementPtrInst>(V);
+        auto *GEPIdx = GEP->idx_begin()->get();
+        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+        Bundle[BundleIndex++] = GEPIdx;
+      }
+
+      // Try and vectorize the indices. We are currently only interested in
+      // gather-like cases of the form:
+      //
+      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+      //
+      // where the loads of "a", the loads of "b", and the subtractions can be
+      // performed in parallel. It's likely that detecting this pattern in a
+      // bottom-up phase will be simpler and less costly than building a
+      // full-blown top-down phase beginning at the consecutive loads.
+      Changed |= tryToVectorizeList(Bundle, R);
+    }
+  }
+  return Changed;
+}
+
 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
-  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
-       it != e; ++it) {
+  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+       ++it) {
     if (it->second.size() < 2)
       continue;
 

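As a rough illustration of the new getVectorElementSize behavior (a sketch
under the assumption that the chain contains only opcodes handled above; %p
and %c are hypothetical values):

  %l = load i16, i16* %p
  %e = zext i16 %l to i64
  %d = sub nsw i64 %e, %c

  ; getVectorElementSize(%d) walks %d -> %e -> %l, finds the i16 load, and
  ; returns 16 rather than 64. With a 128-bit vector register this gives a
  ; vectorization factor of 128 / 16 = 8 instead of 128 / 64 = 2, which is
  ; what allows the <8 x i16> loads checked for in gather-reduce.ll.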
Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=257918&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Fri Jan 15 12:51:51 2016
@@ -0,0 +1,258 @@
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
+;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
+;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
+;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @gather_reduce_8x16_i32
+;
+; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; CHECK: zext <8 x i16> [[L]] to <8 x i32>
+; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i32
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i32
+  %sub8 = sub nsw i32 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i32
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i32
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i32
+  %sub26 = sub nsw i32 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i32
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i32
+  %sub35 = sub nsw i32 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i32
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i32
+  %sub44 = sub nsw i32 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i32
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i32
+  %sub53 = sub nsw i32 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i32
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i32
+  %sub62 = sub nsw i32 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @gather_reduce_8x16_i64
+;
+; CHECK-NOT: load <8 x i16>
+;
+; FIXME: We are currently unable to vectorize the case with i64 subtraction
+;        because the zero extensions are too expensive. The solution here is to
+;        convert the i64 subtractions to i32 subtractions during vectorization.
+;        This would then match the case above.
+;
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i64
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i64
+  %sub = sub nsw i64 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i64
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i64
+  %sub8 = sub nsw i64 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i64
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i64
+  %sub17 = sub nsw i64 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i64
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i64
+  %sub26 = sub nsw i64 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i64
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i64
+  %sub35 = sub nsw i64 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i64
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i64
+  %sub44 = sub nsw i64 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i64
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i64
+  %sub53 = sub nsw i64 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i64
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i64
+  %sub62 = sub nsw i64 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}

Added: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll?rev=257918&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll Fri Jan 15 12:51:51 2016
@@ -0,0 +1,111 @@
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we remove from consideration pairs of seed
+; getelementptrs when they are known to have a constant difference. Such pairs
+; are likely not good candidates for vectorization since one can be computed
+; from the other. We use an unprofitable threshold to force vectorization.
+;
+; int getelementptr(int *g, int n, int w, int x, int y, int z) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[2*i + w]; sum += g[2*i + x];
+;     sum += g[2*i + y]; sum += g[2*i + z];
+;   }
+;   return sum;
+; }
+;
+
+; CHECK-LABEL: @getelementptr_4x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, %x
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @getelementptr_2x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}



