<div dir="ltr">I don't believe you have added a test case to make sure we do not regress and re-trigger PR26364. Please add one.<br><div class="gmail_extra"><br><div class="gmail_quote">On Mon, Feb 1, 2016 at 5:38 AM, Matthew Simpson via llvm-commits <span dir="ltr"><<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">Author: mssimpso<br>
Date: Mon Feb 1 07:38:29 2016
New Revision: 259357

URL: http://llvm.org/viewvc/llvm-project?rev=259357&view=rev
Log:
Reapply commit r258404 with fix.

The previous patch caused PR26364. The fix is to ensure that we don't enter a
cycle when iterating over use-def chains.
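
For context on the fix: a use-def walk can revisit a value through a phi (for
example, a reduction accumulator whose incoming value transitively uses the
phi again), and that is the kind of cycle at issue here. The patch avoids it
by only demoting single-use values inside the expression and by checking the
root's single external user (see computeMinimumValueSizes below). A minimal,
hypothetical sketch of an equivalent explicit guard — not code from the patch:

    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Refuse to revisit a value while walking operands, so a loop phi whose
    // incoming value depends on the phi itself cannot cause infinite
    // recursion.
    static bool walkOperands(Value *V, SmallPtrSetImpl<Value *> &Visited) {
      if (!Visited.insert(V).second)
        return true; // Already seen; this check is what breaks the cycle.
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return true;
      for (Value *Op : I->operands())
        if (!walkOperands(Op, Visited))
          return false;
      return true;
    }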

Modified:
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=259357&r1=259356&r2=259357&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Mon Feb 1 07:38:29 2016
@@ -15,22 +15,24 @@
 // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -45,7 +47,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
 #include <map>
 #include <memory>
@@ -364,9 +366,9 @@ public:

   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
-          DominatorTree *Dt, AssumptionCache *AC)
+          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
-        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
         Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
   }
@@ -400,6 +402,7 @@ public:
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
     }
+    MinBWs.clear();
   }

   /// \brief Perform LICM and CSE on the newly generated gather sequences.
@@ -417,6 +420,10 @@ public:
   /// vectorization factors.
   unsigned getVectorElementSize(Value *V);

+  /// Compute the minimum type sizes required to represent the entries in a
+  /// vectorizable tree.
+  void computeMinimumValueSizes();
+
 private:
   struct TreeEntry;

@@ -914,8 +921,14 @@ private:
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
+  AssumptionCache *AC;
+  DemandedBits *DB;
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
+
+  /// A map of scalar integer values to the smallest bit width with which they
+  /// can legally be represented.
+  MapVector<Value *, uint64_t> MinBWs;
 };

 #ifndef NDEBUG
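
Every consumer of the new map below follows the same lookup pattern. A sketch
of that pattern, using names from the patch but with Scalar, Ctx, and VF
standing in for whatever is available at the use site:

    // Sketch only: how MinBWs is consulted by the cost and codegen changes.
    if (MinBWs.count(Scalar)) {
      auto *NarrowTy = IntegerType::get(Ctx, MinBWs[Scalar]); // e.g., i32
      auto *NarrowVecTy = VectorType::get(NarrowTy, VF);      // e.g., <8 x i32>
      // ... cost or emit the bundle with NarrowVecTy instead of the wide type.
    }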
@@ -1471,6 +1484,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

+  // If we have computed a smaller type for the expression, update VecTy so
+  // that the costs will be accurate.
+  if (MinBWs.count(VL[0]))
+    VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
+                            VL.size());
+
   if (E->NeedToGather) {
     if (allConstant(VL))
       return 0;
@@ -1799,9 +1818,19 @@ int BoUpSLP::getTreeCost() {
     if (EphValues.count(EU.User))
       continue;

-    VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                           EU.Lane);
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+    if (MinBWs.count(ScalarRoot)) {
+      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+      VecTy = VectorType::get(MinTy, BundleWidth);
+      ExtractCost +=
+          TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
+    }
+    ExtractCost +=
+        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
   }

   Cost += getSpillCost();
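
To make the accounting above concrete, here is an illustrative per-external-use
calculation; the numbers are hypothetical and target-dependent (the real ones
come from TTI):

    // With demotion from i64 to i32, each external use is charged for an
    // extract from the narrow vector plus a scalar sign extend back.
    unsigned ExtractNarrow = 2; // getVectorInstrCost(ExtractElement, <8 x i32>)
    unsigned SExtBack = 1;      // getCastInstrCost(SExt, i64, i32)
    unsigned PerUse = ExtractNarrow + SExtBack; // 3, versus a single (often
    // costlier) extract from <8 x i64> when no demotion happens.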
@@ -2499,7 +2528,21 @@ Value *BoUpSLP::vectorizeTree() {
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  vectorizeTree(&VectorizableTree[0]);
+  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+
+  // If the vectorized tree can be rewritten in a smaller type, we truncate the
+  // vectorized root. InstCombine will then rewrite the entire expression. We
+  // sign extend the extracted values below.
+  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+  if (MinBWs.count(ScalarRoot)) {
+    if (auto *I = dyn_cast<Instruction>(VectorRoot))
+      Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+    auto BundleWidth = VectorizableTree[0].Scalars.size();
+    auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
+    auto *VecTy = VectorType::get(MinTy, BundleWidth);
+    auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+    VectorizableTree[0].VectorizedValue = Trunc;
+  }

   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");

@@ -2532,6 +2575,8 @@
         if (PH->getIncomingValue(i) == Scalar) {
           Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
           Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+          if (MinBWs.count(ScalarRoot))
+            Ex = Builder.CreateSExt(Ex, Scalar->getType());
           CSEBlocks.insert(PH->getIncomingBlock(i));
           PH->setOperand(i, Ex);
         }
@@ -2539,12 +2584,16 @@
     } else {
       Builder.SetInsertPoint(cast<Instruction>(User));
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      if (MinBWs.count(ScalarRoot))
+        Ex = Builder.CreateSExt(Ex, Scalar->getType());
       CSEBlocks.insert(cast<Instruction>(User)->getParent());
       User->replaceUsesOfWith(Scalar, Ex);
     }
   } else {
     Builder.SetInsertPoint(&F->getEntryBlock().front());
     Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+    if (MinBWs.count(ScalarRoot))
+      Ex = Builder.CreateSExt(Ex, Scalar->getType());
     CSEBlocks.insert(&F->getEntryBlock());
     User->replaceUsesOfWith(Scalar, Ex);
   }
@@ -3113,7 +3162,7 @@ unsigned BoUpSLP::getVectorElementSize(V
     // If the current instruction is a load, update MaxWidth to reflect the
     // width of the loaded value.
     else if (isa<LoadInst>(I))
-      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
+      MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));

     // Otherwise, we need to visit the operands of the instruction. We only
     // handle the interesting cases from buildTree here. If an operand is an
@@ -3140,6 +3189,171 @@ unsigned BoUpSLP::getVectorElementSize(V
   return MaxWidth;
 }

+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+                                  SmallVectorImpl<Value *> &ToDemote,
+                                  SmallVectorImpl<Value *> &Roots) {
+
+  // We can always demote constants.
+  if (isa<Constant>(V)) {
+    ToDemote.push_back(V);
+    return true;
+  }
+
+  // If the value is not an instruction in the expression with only one use, it
+  // cannot be demoted.
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I || !I->hasOneUse() || !Expr.count(I))
+    return false;
+
+  switch (I->getOpcode()) {
+
+  // We can always demote truncations and extensions. Since truncations can
+  // seed additional demotion, we save the truncated value.
+  case Instruction::Trunc:
+    Roots.push_back(I->getOperand(0));
+  case Instruction::ZExt:
+  case Instruction::SExt:
+    break;
+
+  // We can demote certain binary operations if we can demote both of their
+  // operands.
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+        !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+      return false;
+    break;
+
+  // We can demote selects if we can demote their true and false values.
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+      return false;
+    break;
+  }
+
+  // We can demote phis if we can demote all their incoming operands. Note that
+  // we don't need to worry about cycles since we ensure single use above.
+  case Instruction::PHI: {
+    PHINode *PN = cast<PHINode>(I);
+    for (Value *IncValue : PN->incoming_values())
+      if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+        return false;
+    break;
+  }
+
+  // Otherwise, conservatively give up.
+  default:
+    return false;
+  }
+
+  // Record the value that we can demote.
+  ToDemote.push_back(V);
+  return true;
+}
+
+void BoUpSLP::computeMinimumValueSizes() {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If there are no external uses, the expression tree must be rooted by a
+  // store. We can't demote in-memory values, so there is nothing to do here.
+  if (ExternalUses.empty())
+    return;
+
+  // We only attempt to truncate integer expressions.
+  auto &TreeRoot = VectorizableTree[0].Scalars;
+  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+  if (!TreeRootIT)
+    return;
+
+  // If the expression is not rooted by a store, these roots should have
+  // external uses. We will rely on InstCombine to rewrite the expression in
+  // the narrower type. However, InstCombine only rewrites single-use values.
+  // This means that if a tree entry other than a root is used externally, it
+  // must have multiple uses and InstCombine will not rewrite it. The code
+  // below ensures that only the roots are used externally.
+  SmallPtrSet<Value *, 16> Expr(TreeRoot.begin(), TreeRoot.end());
+  for (auto &EU : ExternalUses)
+    if (!Expr.erase(EU.Scalar))
+      return;
+  if (!Expr.empty())
+    return;
+
+  // Collect the scalar values in one lane of the vectorizable expression. We
+  // will use this context to determine which values can be demoted. If we see
+  // a truncation, we mark it as seeding another demotion.
+  for (auto &Entry : VectorizableTree)
+    Expr.insert(Entry.Scalars[0]);
+
+  // Ensure the root of the vectorizable tree doesn't form a cycle. It must
+  // have a single external user that is not in the vectorizable tree.
+  if (!TreeRoot[0]->hasOneUse() || Expr.count(*TreeRoot[0]->user_begin()))
+    return;
+
+  // Conservatively determine if we can actually truncate the root of the
+  // expression. Collect the values that can be demoted in ToDemote and
+  // additional roots that require investigating in Roots.
+  SmallVector<Value *, 32> ToDemote;
+  SmallVector<Value *, 2> Roots;
+  if (!collectValuesToDemote(TreeRoot[0], Expr, ToDemote, Roots))
+    return;
+
+  // The maximum bit width required to represent all the values that can be
+  // demoted without loss of precision. It would be safe to truncate the root
+  // of the expression to this width.
+  auto MaxBitWidth = 8u;
+
+  // We first check if all the bits of the root are demanded. If they're not,
+  // we can truncate the root to this narrower type.
+  auto Mask = DB->getDemandedBits(cast<Instruction>(TreeRoot[0]));
+  if (Mask.countLeadingZeros() > 0)
+    MaxBitWidth = std::max<unsigned>(
+        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+
+  // If all the bits of the root are demanded, we can try a little harder to
+  // compute a narrower type. This can happen, for example, if the roots are
+  // getelementptr indices. InstCombine promotes these indices to the pointer
+  // width. Thus, all their bits are technically demanded even though the
+  // address computation might be vectorized in a smaller type.
+  //
+  // We start by looking at each entry that can be demoted. We compute the
+  // maximum bit width required to store the scalar by using ValueTracking to
+  // compute the number of high-order bits we can truncate.
+  else
+    for (auto *Scalar : ToDemote) {
+      auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
+      auto NumTypeBits = DL.getTypeSizeInBits(Scalar->getType());
+      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+    }
+
+  // Round MaxBitWidth up to the next power-of-two.
+  if (!isPowerOf2_64(MaxBitWidth))
+    MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+  // If the maximum bit width we compute is less than the width of the roots'
+  // type, we can proceed with the narrowing. Otherwise, do nothing.
+  if (MaxBitWidth >= TreeRootIT->getBitWidth())
+    return;
+
+  // If we can truncate the root, we must collect additional values that might
+  // be demoted as a result. That is, those seeded by truncations we will
+  // modify.
+  while (!Roots.empty())
+    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+  // Finally, map the values we can demote to the maximum bit width we
+  // computed.
+  for (auto *Scalar : ToDemote)
+    MinBWs[Scalar] = MaxBitWidth;
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
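
As a worked example of the bit-width computation above, consider the i64
gather-reduce kernel from the test below, where the root is a sub of two
values zero-extended from i16. Each operand then has at least 48 sign bits, so
the sub retains at least 47. The sign-bit count in this sketch is assumed
rather than computed:

    #include "llvm/Support/MathExtras.h"
    #include <algorithm>

    unsigned exampleMaxBitWidth() {
      unsigned NumTypeBits = 64; // DL.getTypeSizeInBits(i64)
      unsigned NumSignBits = 47; // what ComputeNumSignBits would report
      unsigned MaxBitWidth = std::max(NumTypeBits - NumSignBits, 8u); // 17
      if (!llvm::isPowerOf2_64(MaxBitWidth))
        MaxBitWidth = llvm::NextPowerOf2(MaxBitWidth); // 17 rounds up to 32
      return MaxBitWidth; // 32 < 64, so the tree can be demoted to i32.
    }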
@@ -3161,6 +3375,7 @@ struct SLPVectorizer : public FunctionPa
   LoopInfo *LI;
   DominatorTree *DT;
   AssumptionCache *AC;
+  DemandedBits *DB;

   bool runOnFunction(Function &F) override {
     if (skipOptnoneFunction(F))
@@ -3174,6 +3389,7 @@ struct SLPVectorizer : public FunctionPa
     LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    DB = &getAnalysis<DemandedBits>();

     Stores.clear();
     GEPs.clear();
@@ -3203,7 +3419,7 @@ struct SLPVectorizer : public FunctionPa

     // Use the bottom up slp vectorizer to construct chains that start with
     // store instructions.
-    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
+    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);

     // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
     // delete instructions.
@@ -3246,6 +3462,7 @@ struct SLPVectorizer : public FunctionPa
     AU.addRequired<TargetTransformInfoWrapperPass>();
     AU.addRequired<LoopInfoWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<DemandedBits>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -3350,6 +3567,7 @@ bool SLPVectorizer::vectorizeStoreChain(
       ArrayRef<Value *> Operands = Chain.slice(i, VF);

       R.buildTree(Operands);
+      R.computeMinimumValueSizes();

       int Cost = R.getTreeCost();

@@ -3549,6 +3767,7 @@ bool SLPVectorizer::tryToVectorizeList(A
         Value *ReorderedOps[] = { Ops[1], Ops[0] };
         R.buildTree(ReorderedOps, None);
       }
+      R.computeMinimumValueSizes();
       int Cost = R.getTreeCost();

       if (Cost < -SLPCostThreshold) {
@@ -3815,6 +4034,7 @@ public:

     for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
       V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+      V.computeMinimumValueSizes();

       // Estimate cost.
       int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);

Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=259357&r1=259356&r2=259357&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Mon Feb 1 07:38:29 2016
@@ -1,4 +1,5 @@
-; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
+; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE

 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
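
The second RUN line covers the i64 case that the cost model still rejects:
-slp-threshold=-12 lowers the profitability bar so vectorization proceeds
anyway and the narrowed codegen keeps being exercised. To reproduce either
configuration by hand (the input file name here is just a placeholder):

    opt -S -slp-vectorizer -dce -instcombine gather-reduce.ll
    opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine gather-reduce.ll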
@@ -18,13 +19,13 @@ target triple = "aarch64--linux-gnu"
 ;   return sum;
 ; }

-; CHECK-LABEL: @gather_reduce_8x16_i32
+; PROFITABLE-LABEL: @gather_reduce_8x16_i32
 ;
-; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
-; CHECK: zext <8 x i16> [[L]] to <8 x i32>
-; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
-; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
-; CHECK: sext i32 [[X]] to i64
+; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
+; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; PROFITABLE: sext i32 [[X]] to i64
 ;
 define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
@@ -137,14 +138,18 @@ for.body:
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
 }

-; CHECK-LABEL: @gather_reduce_8x16_i64
+; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
 ;
-; CHECK-NOT: load <8 x i16>
-;
-; FIXME: We are currently unable to vectorize the case with i64 subtraction
-; because the zero extensions are too expensive. The solution here is to
-; convert the i64 subtractions to i32 subtractions during vectorization.
-; This would then match the case above.
+; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
+; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; UNPROFITABLE: sext i32 [[X]] to i64
+;
+; TODO: Although we can now vectorize this case while converting the i64
+; subtractions to i32, the cost model currently finds vectorization to be
+; unprofitable. The cost model is penalizing the sign and zero
+; extensions in the vectorized version, but they are actually free.
+;
 define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
 entry:
