[llvm] r257888 - Revert "[SLP] Vectorize the index computations of getelementptr instructions."

Fri Jan 15 05:10:48 PST 2016

Author: mssimpso
Date: Fri Jan 15 07:10:46 2016
New Revision: 257888

URL: http://llvm.org/viewvc/llvm-project?rev=257888&view=rev
Log:
Revert "[SLP] Vectorize the index computations of getelementptr instructions."

This reverts commit r257800.

Removed:
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=257888&r1=257887&r2=257888&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Jan 15 07:10:46 2016
@@ -412,13 +412,6 @@ public:
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }
 
-  /// \return The vector element size in bits to use when vectorizing the
-  /// expression tree ending at \p V. If V is a store, the size is the width of
-  /// the stored value. Otherwise, the size is the width of the largest loaded
-  /// value reaching V. This method is used by the vectorizer to calculate
-  /// vectorization factors.
-  unsigned getVectorElementSize(Value *V);
-
 private:
   struct TreeEntry;
 
@@ -3146,73 +3139,10 @@ void BoUpSLP::scheduleBlock(BlockSchedul
   BS->ScheduleStart = nullptr;
 }
 
-unsigned BoUpSLP::getVectorElementSize(Value *V) {
-  auto &DL = F->getParent()->getDataLayout();
-
-  // If V is a store, just return the width of the stored value without
-  // traversing the expression tree. This is the common case.
-  if (auto *Store = dyn_cast<StoreInst>(V))
-    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
-
-  // If V is not a store, we can traverse the expression tree to find loads
-  // that feed it. The type of the loaded value may indicate a more suitable
-  // width than V's type. We want to base the vector element size on the width
-  // of memory operations where possible.
-  SmallVector<Instruction *, 16> Worklist;
-  SmallPtrSet<Instruction *, 16> Visited;
-  if (auto *I = dyn_cast<Instruction>(V))
-    Worklist.push_back(I);
-
-  // Traverse the expression tree in bottom-up order looking for loads. If we
-  // encounter an instruciton we don't yet handle, we give up.
-  auto MaxWidth = 0u;
-  auto FoundUnknownInst = false;
-  while (!Worklist.empty() && !FoundUnknownInst) {
-    auto *I = Worklist.pop_back_val();
-    Visited.insert(I);
-
-    // We should only be looking at scalar instructions here. If the current
-    // instruction has a vector type, give up.
-    auto *Ty = I->getType();
-    if (isa<VectorType>(Ty))
-      FoundUnknownInst = true;
-
-    // If the current instruction is a load, update MaxWidth to reflect the
-    // width of the loaded value.
-    else if (isa<LoadInst>(I))
-      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
-
-    // Otherwise, we need to visit the operands of the instruction. We only
-    // handle the interesting cases from buildTree here. If an operand is an
-    // instruction we haven't yet visited, we add it to the worklist.
-    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
-             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
-      for (Use &U : I->operands())
-        if (auto *J = dyn_cast<Instruction>(U.get()))
-          if (!Visited.count(J))
-            Worklist.push_back(J);
-    }
-
-    // If we don't yet handle the instruction, give up.
-    else
-      FoundUnknownInst = true;
-  }
-
-  // If we didn't encounter a memory access in the expression tree, or if we
-  // gave up for some reason, just return the width of V.
-  if (!MaxWidth || FoundUnknownInst)
-    return DL.getTypeSizeInBits(V->getType());
-
-  // Otherwise, return the maximum width we found.
-  return MaxWidth;
-}
-
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
   typedef MapVector<Value *, StoreList> StoreListMap;
-  typedef SmallVector<WeakVH, 8> WeakVHList;
-  typedef MapVector<Value *, WeakVHList> WeakVHListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -3242,8 +3172,7 @@ struct SLPVectorizer : public FunctionPa
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
 
-    Stores.clear();
-    GEPs.clear();
+    StoreRefs.clear();
     bool Changed = false;
 
     // If the target claims to have no vector registers don't attempt
@@ -3277,24 +3206,15 @@ struct SLPVectorizer : public FunctionPa
 
     // Scan the blocks in the function in post order.
     for (auto BB : post_order(&F.getEntryBlock())) {
-      collectSeedInstructions(BB);
-
       // Vectorize trees that end at stores.
-      if (NumStores > 0) {
-        DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
+      if (unsigned count = collectStores(BB, R)) {
+        (void)count;
+        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
         Changed |= vectorizeStoreChains(R);
       }
 
       // Vectorize trees that end at reductions.
       Changed |= vectorizeChainsInBlock(BB, R);
-
-      // Vectorize the index computations of getelementptr instructions. This
-      // is primarily intended to catch gather-like idioms ending at
-      // non-consecutive loads.
-      if (NumGEPs > 0) {
-        DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
-        Changed |= vectorizeGEPIndices(BB, R);
-      }
     }
 
     if (Changed) {
@@ -3321,14 +3241,12 @@ struct SLPVectorizer : public FunctionPa
   }
 
 private:
-  /// \brief Collect store and getelementptr instructions and organize them
-  /// according to the underlying object of their pointer operands. We sort the
-  /// instructions by their underlying objects to reduce the cost of
-  /// consecutive access queries.
-  ///
-  /// TODO: We can further reduce this cost if we flush the chain creation
-  ///       every time we run into a memory barrier.
-  void collectSeedInstructions(BasicBlock *BB);
+
+  /// \brief Collect memory references and sort them according to their base
+  /// object. We sort the stores to their base objects to reduce the cost of the
+  /// quadratic search on the stores. TODO: We can further reduce this cost
+  /// if we flush the chain creation every time we run into a memory barrier.
+  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
@@ -3344,13 +3262,9 @@ private:
   /// \brief Try to vectorize a chain that may start at the operands of \V;
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
 
-  /// \brief Vectorize the store instructions collected in Stores.
+  /// \brief Vectorize the stores that were collected in StoreRefs.
   bool vectorizeStoreChains(BoUpSLP &R);
 
-  /// \brief Vectorize the index computations of the getelementptr instructions
-  /// collected in GEPs.
-  bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
-
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
@@ -3360,19 +3274,8 @@ private:
 
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                        BoUpSLP &R);
-
-  /// The store instructions in a basic block organized by base pointer.
-  StoreListMap Stores;
-
-  /// The getelementptr instructions in a basic block organized by base pointer.
-  WeakVHListMap GEPs;
-
-  /// The number of store instructions in a basic block.
-  unsigned NumStores;
-
-  /// The number of getelementptr instructions in a basic block.
-  unsigned NumGEPs;
-
+private:
+  StoreListMap StoreRefs;
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
 
@@ -3393,7 +3296,9 @@ bool SLPVectorizer::vectorizeStoreChain(
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
         << "\n");
-  unsigned Sz = R.getVectorElementSize(Chain[0]);
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
+  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
   unsigned VF = VecRegSize / Sz;
 
   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3504,43 +3409,33 @@ bool SLPVectorizer::vectorizeStores(Arra
   return Changed;
 }
 
-void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
 
-  // Initialize the collections. We will make a single pass over the block.
-  Stores.clear();
-  GEPs.clear();
-  NumStores = NumGEPs = 0;
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+  unsigned count = 0;
+  StoreRefs.clear();
   const DataLayout &DL = BB->getModule()->getDataLayout();
-
-  // Visit the store and getelementptr instructions in BB and organize them in
-  // Stores and GEPs according to the underlying objects of their pointer
-  // operands.
   for (Instruction &I : *BB) {
+    StoreInst *SI = dyn_cast<StoreInst>(&I);
+    if (!SI)
+      continue;
 
-    // Ignore store instructions that are volatile or have a pointer operand
-    // that doesn't point to a scalar type.
-    if (auto *SI = dyn_cast<StoreInst>(&I)) {
-      if (!SI->isSimple())
-        continue;
-      if (!isValidElementType(SI->getValueOperand()->getType()))
-        continue;
-      Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
-      ++NumStores;
-    }
+    // Don't touch volatile stores.
+    if (!SI->isSimple())
+      continue;
 
-    // Ignore getelementptr instructions that have more than one index, a
-    // constant index, or a pointer operand that doesn't point to a scalar
-    // type.
-    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-      auto Idx = GEP->idx_begin()->get();
-      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
-        continue;
-      if (!isValidElementType(Idx->getType()))
-        continue;
-      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
-      ++NumGEPs;
-    }
+    // Check that the pointer points to scalars.
+    Type *Ty = SI->getValueOperand()->getType();
+    if (!isValidElementType(Ty))
+      continue;
+
+    // Find the base pointer.
+    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
+
+    // Save the store locations.
+    StoreRefs[Ptr].push_back(SI);
+    count++;
   }
+  return count;
 }
 
 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
@@ -3564,10 +3459,12 @@ bool SLPVectorizer::tryToVectorizeList(A
     return false;
 
   unsigned Opcode0 = I0->getOpcode();
+  const DataLayout &DL = I0->getModule()->getDataLayout();
 
+  Type *Ty0 = I0->getType();
+  unsigned Sz = DL.getTypeSizeInBits(Ty0);
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
-  unsigned Sz = R.getVectorElementSize(I0);
   unsigned VF = MinVecRegSize / Sz;
 
   for (Value *V : VL) {
@@ -4286,83 +4183,10 @@ bool SLPVectorizer::vectorizeChainsInBlo
   return Changed;
 }
 
-bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
-  auto Changed = false;
-  for (auto &Entry : GEPs) {
-    auto &GEPList = Entry.second;
-
-    // If the getelementptr list has fewer than two elements, there's nothing
-    // to do.
-    if (GEPList.size() < 2)
-      continue;
-
-    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
-                 << GEPList.size() << ".\n");
-
-    // Initialize a set a candidate getelementptrs. Note that we use a
-    // SetVector here to preserve program order. If the index computations are
-    // vectorizable and begin with loads, we want to minimize the chance of
-    // having to reorder them later.
-    SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
-
-    // Some of the candidates may have already been vectorized after we
-    // initially collected them. If so, the WeakVHs will have nullified the
-    // values, so remove them from the set of candidates.
-    Candidates.remove(nullptr);
-
-    // Remove from the set of candidates all pairs of getelementptrs with
-    // constant differences. Such getelementptrs are likely not good candidates
-    // for vectorization in a bottom-up phase since one can be computed from
-    // the other.
-    for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
-      auto *GEP = SE->getSCEV(GEPList[I]);
-      for (int J = I + 1; J < E && Candidates.size() > 1; ++J)
-        if (isa<SCEVConstant>(SE->getMinusSCEV(GEP, SE->getSCEV(GEPList[J])))) {
-          Candidates.remove(GEPList[I]);
-          Candidates.remove(GEPList[J]);
-        }
-    }
-
-    // We break out of the above computation as soon as we know there are fewer
-    // than two candidates remaining.
-    if (Candidates.size() < 2)
-      continue;
-
-    // Add the single, non-constant index of each candidate to the bundle. We
-    // ensured the indices met these constraints when we originally collected
-    // the getelementptrs.
-    SmallVector<Value *, 16> Bundle(Candidates.size());
-    auto BundleIndex = 0u;
-    for (auto *V : Candidates) {
-      auto *GEP = cast<GetElementPtrInst>(V);
-      auto *GEPIdx = GEP->idx_begin()->get();
-      assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
-      Bundle[BundleIndex++] = GEPIdx;
-    }
-
-    // Try and vectorize the indices. We are currently only interested in
-    // gather-like cases of the form:
-    //
-    // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
-    //
-    // where the loads of "a", the loads of "b", and the subtractions can be
-    // performed in parallel. It's likely that detecting this pattern in a
-    // bottom-up phase will be simpler and less costly than building a
-    // full-blown top-down phase beginning at the consecutive loads. We process
-    // the bundle in chunks of 16 (like we do for stores) to minimize
-    // compile-time.
-    for (unsigned BI = 0, BE = Bundle.size(); BI < BE; BI += 16) {
-      auto Len = std::min<unsigned>(BE - BI, 16);
-      Changed |= tryToVectorizeList(makeArrayRef(&Bundle[BI], Len), R);
-    }
-  }
-  return Changed;
-}
-
 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
-  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end();
+  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
        it != e; ++it) {
     if (it->second.size() < 2)
       continue;

Removed: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=257887&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (removed)
@@ -1,258 +0,0 @@
-; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; These tests check that we vectorize the index calculations in the
-; gather-reduce pattern shown below. We check cases having i32 and i64
-; subtraction.
-;
-; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
-;   int sum = 0;
-;   for (int i = 0; i < n ; ++i) {
-;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
-;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
-;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
-;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
-;   }
-;   return sum;
-; }
-
-; CHECK-LABEL: @gather_reduce_8x16_i32
-;
-; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; CHECK: zext <8 x i16> [[L]] to <8 x i32>
-; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; CHECK: sext i32 [[X]] to i64
-;
-define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
-entry:
-  %cmp.99 = icmp sgt i32 %n, 0
-  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
-  ret i32 %sum.0.lcssa
-
-for.body:
-  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
-  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
-  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
-  %0 = load i16, i16* %a.addr.0101, align 2
-  %conv = zext i16 %0 to i32
-  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
-  %1 = load i16, i16* %b, align 2
-  %conv2 = zext i16 %1 to i32
-  %sub = sub nsw i32 %conv, %conv2
-  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
-  %2 = load i16, i16* %arrayidx, align 2
-  %conv3 = zext i16 %2 to i32
-  %add = add nsw i32 %conv3, %sum.0102
-  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
-  %3 = load i16, i16* %incdec.ptr, align 2
-  %conv5 = zext i16 %3 to i32
-  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
-  %4 = load i16, i16* %incdec.ptr1, align 2
-  %conv7 = zext i16 %4 to i32
-  %sub8 = sub nsw i32 %conv5, %conv7
-  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
-  %5 = load i16, i16* %arrayidx10, align 2
-  %conv11 = zext i16 %5 to i32
-  %add12 = add nsw i32 %add, %conv11
-  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
-  %6 = load i16, i16* %incdec.ptr4, align 2
-  %conv14 = zext i16 %6 to i32
-  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
-  %7 = load i16, i16* %incdec.ptr6, align 2
-  %conv16 = zext i16 %7 to i32
-  %sub17 = sub nsw i32 %conv14, %conv16
-  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
-  %8 = load i16, i16* %arrayidx19, align 2
-  %conv20 = zext i16 %8 to i32
-  %add21 = add nsw i32 %add12, %conv20
-  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
-  %9 = load i16, i16* %incdec.ptr13, align 2
-  %conv23 = zext i16 %9 to i32
-  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
-  %10 = load i16, i16* %incdec.ptr15, align 2
-  %conv25 = zext i16 %10 to i32
-  %sub26 = sub nsw i32 %conv23, %conv25
-  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
-  %11 = load i16, i16* %arrayidx28, align 2
-  %conv29 = zext i16 %11 to i32
-  %add30 = add nsw i32 %add21, %conv29
-  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
-  %12 = load i16, i16* %incdec.ptr22, align 2
-  %conv32 = zext i16 %12 to i32
-  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
-  %13 = load i16, i16* %incdec.ptr24, align 2
-  %conv34 = zext i16 %13 to i32
-  %sub35 = sub nsw i32 %conv32, %conv34
-  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
-  %14 = load i16, i16* %arrayidx37, align 2
-  %conv38 = zext i16 %14 to i32
-  %add39 = add nsw i32 %add30, %conv38
-  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
-  %15 = load i16, i16* %incdec.ptr31, align 2
-  %conv41 = zext i16 %15 to i32
-  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
-  %16 = load i16, i16* %incdec.ptr33, align 2
-  %conv43 = zext i16 %16 to i32
-  %sub44 = sub nsw i32 %conv41, %conv43
-  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
-  %17 = load i16, i16* %arrayidx46, align 2
-  %conv47 = zext i16 %17 to i32
-  %add48 = add nsw i32 %add39, %conv47
-  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
-  %18 = load i16, i16* %incdec.ptr40, align 2
-  %conv50 = zext i16 %18 to i32
-  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
-  %19 = load i16, i16* %incdec.ptr42, align 2
-  %conv52 = zext i16 %19 to i32
-  %sub53 = sub nsw i32 %conv50, %conv52
-  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
-  %20 = load i16, i16* %arrayidx55, align 2
-  %conv56 = zext i16 %20 to i32
-  %add57 = add nsw i32 %add48, %conv56
-  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
-  %21 = load i16, i16* %incdec.ptr49, align 2
-  %conv59 = zext i16 %21 to i32
-  %22 = load i16, i16* %incdec.ptr51, align 2
-  %conv61 = zext i16 %22 to i32
-  %sub62 = sub nsw i32 %conv59, %conv61
-  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
-  %23 = load i16, i16* %arrayidx64, align 2
-  %conv65 = zext i16 %23 to i32
-  %add66 = add nsw i32 %add57, %conv65
-  %inc = add nuw nsw i32 %i.0103, 1
-  %exitcond = icmp eq i32 %inc, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; CHECK-LABEL: @gather_reduce_8x16_i64
-;
-; CHECK-NOT: load <8 x i16>
-;
-; FIXME: We are currently unable to vectorize the case with i64 subtraction
-;        because the zero extensions are too expensive. The solution here is to
-;        convert the i64 subtractions to i32 subtractions during vectorization.
-;        This would then match the case above.
-;
-define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
-entry:
-  %cmp.99 = icmp sgt i32 %n, 0
-  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
-  ret i32 %sum.0.lcssa
-
-for.body:
-  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
-  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
-  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
-  %0 = load i16, i16* %a.addr.0101, align 2
-  %conv = zext i16 %0 to i64
-  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
-  %1 = load i16, i16* %b, align 2
-  %conv2 = zext i16 %1 to i64
-  %sub = sub nsw i64 %conv, %conv2
-  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
-  %2 = load i16, i16* %arrayidx, align 2
-  %conv3 = zext i16 %2 to i32
-  %add = add nsw i32 %conv3, %sum.0102
-  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
-  %3 = load i16, i16* %incdec.ptr, align 2
-  %conv5 = zext i16 %3 to i64
-  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
-  %4 = load i16, i16* %incdec.ptr1, align 2
-  %conv7 = zext i16 %4 to i64
-  %sub8 = sub nsw i64 %conv5, %conv7
-  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
-  %5 = load i16, i16* %arrayidx10, align 2
-  %conv11 = zext i16 %5 to i32
-  %add12 = add nsw i32 %add, %conv11
-  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
-  %6 = load i16, i16* %incdec.ptr4, align 2
-  %conv14 = zext i16 %6 to i64
-  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
-  %7 = load i16, i16* %incdec.ptr6, align 2
-  %conv16 = zext i16 %7 to i64
-  %sub17 = sub nsw i64 %conv14, %conv16
-  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
-  %8 = load i16, i16* %arrayidx19, align 2
-  %conv20 = zext i16 %8 to i32
-  %add21 = add nsw i32 %add12, %conv20
-  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
-  %9 = load i16, i16* %incdec.ptr13, align 2
-  %conv23 = zext i16 %9 to i64
-  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
-  %10 = load i16, i16* %incdec.ptr15, align 2
-  %conv25 = zext i16 %10 to i64
-  %sub26 = sub nsw i64 %conv23, %conv25
-  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
-  %11 = load i16, i16* %arrayidx28, align 2
-  %conv29 = zext i16 %11 to i32
-  %add30 = add nsw i32 %add21, %conv29
-  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
-  %12 = load i16, i16* %incdec.ptr22, align 2
-  %conv32 = zext i16 %12 to i64
-  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
-  %13 = load i16, i16* %incdec.ptr24, align 2
-  %conv34 = zext i16 %13 to i64
-  %sub35 = sub nsw i64 %conv32, %conv34
-  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
-  %14 = load i16, i16* %arrayidx37, align 2
-  %conv38 = zext i16 %14 to i32
-  %add39 = add nsw i32 %add30, %conv38
-  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
-  %15 = load i16, i16* %incdec.ptr31, align 2
-  %conv41 = zext i16 %15 to i64
-  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
-  %16 = load i16, i16* %incdec.ptr33, align 2
-  %conv43 = zext i16 %16 to i64
-  %sub44 = sub nsw i64 %conv41, %conv43
-  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
-  %17 = load i16, i16* %arrayidx46, align 2
-  %conv47 = zext i16 %17 to i32
-  %add48 = add nsw i32 %add39, %conv47
-  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
-  %18 = load i16, i16* %incdec.ptr40, align 2
-  %conv50 = zext i16 %18 to i64
-  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
-  %19 = load i16, i16* %incdec.ptr42, align 2
-  %conv52 = zext i16 %19 to i64
-  %sub53 = sub nsw i64 %conv50, %conv52
-  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
-  %20 = load i16, i16* %arrayidx55, align 2
-  %conv56 = zext i16 %20 to i32
-  %add57 = add nsw i32 %add48, %conv56
-  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
-  %21 = load i16, i16* %incdec.ptr49, align 2
-  %conv59 = zext i16 %21 to i64
-  %22 = load i16, i16* %incdec.ptr51, align 2
-  %conv61 = zext i16 %22 to i64
-  %sub62 = sub nsw i64 %conv59, %conv61
-  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
-  %23 = load i16, i16* %arrayidx64, align 2
-  %conv65 = zext i16 %23 to i32
-  %add66 = add nsw i32 %add57, %conv65
-  %inc = add nuw nsw i32 %i.0103, 1
-  %exitcond = icmp eq i32 %inc, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}

Removed: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll?rev=257887&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll (removed)
@@ -1,111 +0,0 @@
-; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
-
-target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; These tests check that we remove from consideration pairs of seed
-; getelementptrs when they are known to have a constant difference. Such pairs
-; are likely not good candidates for vectorization since one can be computed
-; from the other. We use an unprofitable threshold to force vectorization.
-;
-; int getelementptr(int *g, int n, int w, int x, int y, int z) {
-;   int sum = 0;
-;   for (int i = 0; i < n ; ++i) {
-;     sum += g[2*i + w]; sum += g[2*i + x];
-;     sum += g[2*i + y]; sum += g[2*i + z];
-;   }
-;   return sum;
-; }
-;
-
-; CHECK-LABEL: @getelementptr_4x32
-;
-; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
-; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
-; CHECK: sext i32 [[X]] to i64
-;
-define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
-entry:
-  %cmp31 = icmp sgt i32 %n, 0
-  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
-  ret i32 %sum.0.lcssa
-
-for.body:
-  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
-  %t4 = shl nsw i32 %indvars.iv, 1
-  %t5 = add nsw i32 %t4, 0
-  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
-  %t6 = load i32, i32* %arrayidx, align 4
-  %add1 = add nsw i32 %t6, %sum.032
-  %t7 = add nsw i32 %t4, %x
-  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
-  %t8 = load i32, i32* %arrayidx5, align 4
-  %add6 = add nsw i32 %add1, %t8
-  %t9 = add nsw i32 %t4, %y
-  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
-  %t10 = load i32, i32* %arrayidx10, align 4
-  %add11 = add nsw i32 %add6, %t10
-  %t11 = add nsw i32 %t4, %z
-  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
-  %t12 = load i32, i32* %arrayidx15, align 4
-  %add16 = add nsw i32 %add11, %t12
-  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
-  %exitcond = icmp eq i32 %indvars.iv.next , %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-; CHECK-LABEL: @getelementptr_2x32
-;
-; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
-; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
-; CHECK: sext i32 [[X]] to i64
-;
-define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
-entry:
-  %cmp31 = icmp sgt i32 %n, 0
-  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
-  ret i32 %sum.0.lcssa
-
-for.body:
-  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
-  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
-  %t4 = shl nsw i32 %indvars.iv, 1
-  %t5 = add nsw i32 %t4, 0
-  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
-  %t6 = load i32, i32* %arrayidx, align 4
-  %add1 = add nsw i32 %t6, %sum.032
-  %t7 = add nsw i32 %t4, 1
-  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
-  %t8 = load i32, i32* %arrayidx5, align 4
-  %add6 = add nsw i32 %add1, %t8
-  %t9 = add nsw i32 %t4, %y
-  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
-  %t10 = load i32, i32* %arrayidx10, align 4
-  %add11 = add nsw i32 %add6, %t10
-  %t11 = add nsw i32 %t4, %z
-  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
-  %t12 = load i32, i32* %arrayidx15, align 4
-  %add16 = add nsw i32 %add11, %t12
-  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
-  %exitcond = icmp eq i32 %indvars.iv.next , %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}