[llvm] r259357 - Reapply commit r258404 with fix.
David Majnemer via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 17 11:08:35 PST 2016
I've reverted this in r261137; it seems to be the root cause of PR26629.
On Mon, Feb 1, 2016 at 11:15 AM, Matthew Simpson <mssimpso at codeaurora.org>
wrote:
> Test case added in r259385. Thanks again for reducing the test case!
>
> -- Matt
>
> *From:* Matthew Simpson [mailto:mssimpso at codeaurora.org]
> *Sent:* Monday, February 01, 2016 1:42 PM
> *To:* 'David Majnemer'
> *Cc:* 'llvm-commits'
> *Subject:* RE: [llvm] r259357 - Reapply commit r258404 with fix.
>
> Will do. Thanks, David.
>
> -- Matt
>
> *From:* David Majnemer [mailto:david.majnemer at gmail.com]
> *Sent:* Monday, February 01, 2016 1:17 PM
> *To:* Matthew Simpson
> *Cc:* llvm-commits
> *Subject:* Re: [llvm] r259357 - Reapply commit r258404 with fix.
>
> I don't believe you have added a test case to make sure we do not regress
> and re-trigger PR26364. Please add one.
>
> On Mon, Feb 1, 2016 at 5:38 AM, Matthew Simpson via llvm-commits <
> llvm-commits at lists.llvm.org> wrote:
>
> Author: mssimpso
> Date: Mon Feb 1 07:38:29 2016
> New Revision: 259357
>
> URL: http://llvm.org/viewvc/llvm-project?rev=259357&view=rev
> Log:
> Reapply commit r258404 with fix.
>
> The previous patch caused PR26364. The fix is to ensure that we don't enter a
> cycle when iterating over use-def chains.
>
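> For illustration, the hazard is a use-def walk that re-enters a value it is
> already visiting, e.g. a PHI that feeds an instruction which feeds the PHI
> again. The fix below only descends into single-use instructions that belong
> to the expression, plus a separate check that the root's one user lies
> outside it, so the walk forms a tree and must terminate. A minimal sketch of
> that shape (hypothetical helper, not the patch's exact code):
>
>   #include "llvm/ADT/SmallPtrSet.h"
>   #include "llvm/IR/Instruction.h"
>   #include "llvm/Support/Casting.h"
>
>   // Sketch: each visited instruction has exactly one use, which is the
>   // parent that visited it, so no value can be reached a second time
>   // (the root itself is checked separately by the caller).
>   static bool walkUseDefChain(llvm::Value *V,
>                               llvm::SmallPtrSetImpl<llvm::Value *> &Expr) {
>     auto *I = llvm::dyn_cast<llvm::Instruction>(V);
>     if (!I || !I->hasOneUse() || !Expr.count(I))
>       return false;
>     for (llvm::Value *Op : I->operands())
>       walkUseDefChain(Op, Expr);
>     return true;
>   }
>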
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=259357&r1=259356&r2=259357&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Mon Feb 1 07:38:29 2016
> @@ -15,22 +15,24 @@
> // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
> //
>
> //===----------------------------------------------------------------------===//
> -#include "llvm/Transforms/Vectorize.h"
> #include "llvm/ADT/MapVector.h"
> #include "llvm/ADT/Optional.h"
> #include "llvm/ADT/PostOrderIterator.h"
> #include "llvm/ADT/SetVector.h"
> #include "llvm/ADT/Statistic.h"
> #include "llvm/Analysis/AliasAnalysis.h"
> -#include "llvm/Analysis/GlobalsModRef.h"
> #include "llvm/Analysis/AssumptionCache.h"
> #include "llvm/Analysis/CodeMetrics.h"
> +#include "llvm/Analysis/DemandedBits.h"
> +#include "llvm/Analysis/GlobalsModRef.h"
> +#include "llvm/Analysis/LoopAccessAnalysis.h"
> #include "llvm/Analysis/LoopInfo.h"
> #include "llvm/Analysis/LoopAccessAnalysis.h"
> #include "llvm/Analysis/ScalarEvolution.h"
> #include "llvm/Analysis/ScalarEvolutionExpressions.h"
> #include "llvm/Analysis/TargetTransformInfo.h"
> #include "llvm/Analysis/ValueTracking.h"
> +#include "llvm/Analysis/VectorUtils.h"
> #include "llvm/IR/DataLayout.h"
> #include "llvm/IR/Dominators.h"
> #include "llvm/IR/IRBuilder.h"
> @@ -45,7 +47,7 @@
> #include "llvm/Support/CommandLine.h"
> #include "llvm/Support/Debug.h"
> #include "llvm/Support/raw_ostream.h"
> -#include "llvm/Analysis/VectorUtils.h"
> +#include "llvm/Transforms/Vectorize.h"
> #include <algorithm>
> #include <map>
> #include <memory>
> @@ -364,9 +366,9 @@ public:
>
> BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
> TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
> - DominatorTree *Dt, AssumptionCache *AC)
> + DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
> : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
> - SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
> +        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
> Builder(Se->getContext()) {
> CodeMetrics::collectEphemeralValues(F, AC, EphValues);
> }
> @@ -400,6 +402,7 @@ public:
> BlockScheduling *BS = Iter.second.get();
> BS->clear();
> }
> + MinBWs.clear();
> }
>
> /// \brief Perform LICM and CSE on the newly generated gather sequences.
> @@ -417,6 +420,10 @@ public:
> /// vectorization factors.
> unsigned getVectorElementSize(Value *V);
>
> +  /// Compute the minimum type sizes required to represent the entries in a
> +  /// vectorizable tree.
> + void computeMinimumValueSizes();
> +
> private:
> struct TreeEntry;
>
> @@ -914,8 +921,14 @@ private:
> AliasAnalysis *AA;
> LoopInfo *LI;
> DominatorTree *DT;
> + AssumptionCache *AC;
> + DemandedBits *DB;
> /// Instruction builder to construct the vectorized tree.
> IRBuilder<> Builder;
> +
> +  /// A map of scalar integer values to the smallest bit width with which they
> +  /// can legally be represented.
> + MapVector<Value *, uint64_t> MinBWs;
> };
>
> #ifndef NDEBUG
> @@ -1471,6 +1484,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
> ScalarTy = SI->getValueOperand()->getType();
> VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
>
> +  // If we have computed a smaller type for the expression, update VecTy so
> +  // that the costs will be accurate.
> +  if (MinBWs.count(VL[0]))
> +    VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
> +                            VL.size());
> +
> if (E->NeedToGather) {
> if (allConstant(VL))
> return 0;
> @@ -1799,9 +1818,19 @@ int BoUpSLP::getTreeCost() {
> if (EphValues.count(EU.User))
> continue;
>
> -    VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
> -    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
> -                                           EU.Lane);
> +    // If we plan to rewrite the tree in a smaller type, we will need to sign
> +    // extend the extracted value back to the original type. Here, we account
> +    // for the extract and the added cost of the sign extend if needed.
> +    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
> +    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
> +    if (MinBWs.count(ScalarRoot)) {
> +      auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
> +      VecTy = VectorType::get(MinTy, BundleWidth);
> +      ExtractCost +=
> +          TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
> +    }
> +    ExtractCost +=
> +        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
> }
>
> Cost += getSpillCost();
> @@ -2499,7 +2528,21 @@ Value *BoUpSLP::vectorizeTree() {
> }
>
> Builder.SetInsertPoint(&F->getEntryBlock().front());
> - vectorizeTree(&VectorizableTree[0]);
> + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
> +
> +  // If the vectorized tree can be rewritten in a smaller type, we truncate the
> +  // vectorized root. InstCombine will then rewrite the entire expression. We
> +  // sign extend the extracted values below.
> + auto *ScalarRoot = VectorizableTree[0].Scalars[0];
> + if (MinBWs.count(ScalarRoot)) {
> + if (auto *I = dyn_cast<Instruction>(VectorRoot))
> + Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
> + auto BundleWidth = VectorizableTree[0].Scalars.size();
> + auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
> + auto *VecTy = VectorType::get(MinTy, BundleWidth);
> + auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
> + VectorizableTree[0].VectorizedValue = Trunc;
> + }
>
> DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
>
> @@ -2532,6 +2575,8 @@ Value *BoUpSLP::vectorizeTree() {
> if (PH->getIncomingValue(i) == Scalar) {
> Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> + if (MinBWs.count(ScalarRoot))
> + Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(PH->getIncomingBlock(i));
> PH->setOperand(i, Ex);
> }
> @@ -2539,12 +2584,16 @@ Value *BoUpSLP::vectorizeTree() {
> } else {
> Builder.SetInsertPoint(cast<Instruction>(User));
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> + if (MinBWs.count(ScalarRoot))
> + Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(cast<Instruction>(User)->getParent());
> User->replaceUsesOfWith(Scalar, Ex);
> }
> } else {
> Builder.SetInsertPoint(&F->getEntryBlock().front());
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> + if (MinBWs.count(ScalarRoot))
> + Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(&F->getEntryBlock());
> User->replaceUsesOfWith(Scalar, Ex);
> }
> @@ -3113,7 +3162,7 @@ unsigned BoUpSLP::getVectorElementSize(V
> // If the current instruction is a load, update MaxWidth to reflect the
> // width of the loaded value.
> else if (isa<LoadInst>(I))
> - MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
> + MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
>
> // Otherwise, we need to visit the operands of the instruction. We only
> // handle the interesting cases from buildTree here. If an operand is an
> @@ -3140,6 +3189,171 @@ unsigned BoUpSLP::getVectorElementSize(V
> return MaxWidth;
> }
>
> +// Determine if a value V in a vectorizable expression Expr can be demoted to a
> +// smaller type with a truncation. We collect the values that will be demoted
> +// in ToDemote and additional roots that require investigating in Roots.
> +static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
> + SmallVectorImpl<Value *> &ToDemote,
> + SmallVectorImpl<Value *> &Roots) {
> +
> + // We can always demote constants.
> + if (isa<Constant>(V)) {
> + ToDemote.push_back(V);
> + return true;
> + }
> +
> +  // If the value is not an instruction in the expression with only one use, it
> +  // cannot be demoted.
> + auto *I = dyn_cast<Instruction>(V);
> + if (!I || !I->hasOneUse() || !Expr.count(I))
> + return false;
> +
> + switch (I->getOpcode()) {
> +
> +  // We can always demote truncations and extensions. Since truncations can
> +  // seed additional demotion, we save the truncated value.
> + case Instruction::Trunc:
> + Roots.push_back(I->getOperand(0));
> + case Instruction::ZExt:
> + case Instruction::SExt:
> + break;
> +
> +  // We can demote certain binary operations if we can demote both of their
> +  // operands.
> + case Instruction::Add:
> + case Instruction::Sub:
> + case Instruction::Mul:
> + case Instruction::And:
> + case Instruction::Or:
> + case Instruction::Xor:
> + if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
> + !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
> + return false;
> + break;
> +
> + // We can demote selects if we can demote their true and false values.
> + case Instruction::Select: {
> + SelectInst *SI = cast<SelectInst>(I);
> +    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
> +        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
> + return false;
> + break;
> + }
> +
> +  // We can demote phis if we can demote all their incoming operands. Note that
> +  // we don't need to worry about cycles since we ensure single use above.
> + case Instruction::PHI: {
> + PHINode *PN = cast<PHINode>(I);
> + for (Value *IncValue : PN->incoming_values())
> + if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
> + return false;
> + break;
> + }
> +
> + // Otherwise, conservatively give up.
> + default:
> + return false;
> + }
> +
> + // Record the value that we can demote.
> + ToDemote.push_back(V);
> + return true;
> +}
> +
> +void BoUpSLP::computeMinimumValueSizes() {
> + auto &DL = F->getParent()->getDataLayout();
> +
> +  // If there are no external uses, the expression tree must be rooted by a
> +  // store. We can't demote in-memory values, so there is nothing to do here.
> + if (ExternalUses.empty())
> + return;
> +
> + // We only attempt to truncate integer expressions.
> + auto &TreeRoot = VectorizableTree[0].Scalars;
> + auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
> + if (!TreeRootIT)
> + return;
> +
> + // If the expression is not rooted by a store, these roots should have
> +  // external uses. We will rely on InstCombine to rewrite the expression in
> +  // the narrower type. However, InstCombine only rewrites single-use values.
> +  // This means that if a tree entry other than a root is used externally, it
> +  // must have multiple uses and InstCombine will not rewrite it. The code
> +  // below ensures that only the roots are used externally.
> + SmallPtrSet<Value *, 16> Expr(TreeRoot.begin(), TreeRoot.end());
> + for (auto &EU : ExternalUses)
> + if (!Expr.erase(EU.Scalar))
> + return;
> + if (!Expr.empty())
> + return;
> +
> +  // Collect the scalar values in one lane of the vectorizable expression. We
> +  // will use this context to determine which values can be demoted. If we see
> +  // a truncation, we mark it as seeding another demotion.
> + for (auto &Entry : VectorizableTree)
> + Expr.insert(Entry.Scalars[0]);
> +
> +  // Ensure the root of the vectorizable tree doesn't form a cycle. It must
> +  // have a single external user that is not in the vectorizable tree.
> + if (!TreeRoot[0]->hasOneUse() || Expr.count(*TreeRoot[0]->user_begin()))
> + return;
> +
> + // Conservatively determine if we can actually truncate the root of the
> + // expression. Collect the values that can be demoted in ToDemote and
> + // additional roots that require investigating in Roots.
> + SmallVector<Value *, 32> ToDemote;
> + SmallVector<Value *, 2> Roots;
> + if (!collectValuesToDemote(TreeRoot[0], Expr, ToDemote, Roots))
> + return;
> +
> +  // The maximum bit width required to represent all the values that can be
> +  // demoted without loss of precision. It would be safe to truncate the root
> +  // of the expression to this width.
> + auto MaxBitWidth = 8u;
> +
> +  // We first check if all the bits of the root are demanded. If they're not,
> +  // we can truncate the root to this narrower type.
> + auto Mask = DB->getDemandedBits(cast<Instruction>(TreeRoot[0]));
> + if (Mask.countLeadingZeros() > 0)
> +    MaxBitWidth = std::max<unsigned>(
> +        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
> +
> +  // If all the bits of the root are demanded, we can try a little harder to
> +  // compute a narrower type. This can happen, for example, if the roots are
> +  // getelementptr indices. InstCombine promotes these indices to the pointer
> +  // width. Thus, all their bits are technically demanded even though the
> +  // address computation might be vectorized in a smaller type.
> +  //
> +  // We start by looking at each entry that can be demoted. We compute the
> +  // maximum bit width required to store the scalar by using ValueTracking to
> +  // compute the number of high-order bits we can truncate.
> + else
> + for (auto *Scalar : ToDemote) {
> + auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
> + auto NumTypeBits = DL.getTypeSizeInBits(Scalar->getType());
> +      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
> + }
> +
> + // Round MaxBitWidth up to the next power-of-two.
> + if (!isPowerOf2_64(MaxBitWidth))
> + MaxBitWidth = NextPowerOf2(MaxBitWidth);
> +
> +  // If the maximum bit width we compute is less than the width of the roots'
> +  // type, we can proceed with the narrowing. Otherwise, do nothing.
> + if (MaxBitWidth >= TreeRootIT->getBitWidth())
> + return;
> +
> +  // If we can truncate the root, we must collect additional values that might
> +  // be demoted as a result. That is, those seeded by truncations we will
> +  // modify.
> + while (!Roots.empty())
> + collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
> +
> +  // Finally, map the values we can demote to the maximum bit width we computed.
> + for (auto *Scalar : ToDemote)
> + MinBWs[Scalar] = MaxBitWidth;
> +}
> +
> /// The SLPVectorizer Pass.
> struct SLPVectorizer : public FunctionPass {
> typedef SmallVector<StoreInst *, 8> StoreList;
> @@ -3161,6 +3375,7 @@ struct SLPVectorizer : public FunctionPa
> LoopInfo *LI;
> DominatorTree *DT;
> AssumptionCache *AC;
> + DemandedBits *DB;
>
> bool runOnFunction(Function &F) override {
> if (skipOptnoneFunction(F))
> @@ -3174,6 +3389,7 @@ struct SLPVectorizer : public FunctionPa
> LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
> + DB = &getAnalysis<DemandedBits>();
>
> Stores.clear();
> GEPs.clear();
> @@ -3203,7 +3419,7 @@ struct SLPVectorizer : public FunctionPa
>
> // Use the bottom up slp vectorizer to construct chains that start with
> // store instructions.
> - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
> + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
>
> // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
> // delete instructions.
> @@ -3246,6 +3462,7 @@ struct SLPVectorizer : public FunctionPa
> AU.addRequired<TargetTransformInfoWrapperPass>();
> AU.addRequired<LoopInfoWrapperPass>();
> AU.addRequired<DominatorTreeWrapperPass>();
> + AU.addRequired<DemandedBits>();
> AU.addPreserved<LoopInfoWrapperPass>();
> AU.addPreserved<DominatorTreeWrapperPass>();
> AU.addPreserved<AAResultsWrapperPass>();
> @@ -3350,6 +3567,7 @@ bool SLPVectorizer::vectorizeStoreChain(
> ArrayRef<Value *> Operands = Chain.slice(i, VF);
>
> R.buildTree(Operands);
> + R.computeMinimumValueSizes();
>
> int Cost = R.getTreeCost();
>
> @@ -3549,6 +3767,7 @@ bool SLPVectorizer::tryToVectorizeList(A
> Value *ReorderedOps[] = { Ops[1], Ops[0] };
> R.buildTree(ReorderedOps, None);
> }
> + R.computeMinimumValueSizes();
> int Cost = R.getTreeCost();
>
> if (Cost < -SLPCostThreshold) {
> @@ -3815,6 +4034,7 @@ public:
>
> for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
> V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
> + V.computeMinimumValueSizes();
>
> // Estimate cost.
> int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=259357&r1=259356&r2=259357&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Mon Feb 1 07:38:29 2016
> @@ -1,4 +1,5 @@
> -; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
> +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
> +; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE
>
> target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
> target triple = "aarch64--linux-gnu"
> @@ -18,13 +19,13 @@ target triple = "aarch64--linux-gnu"
> ; return sum;
> ; }
>
> -; CHECK-LABEL: @gather_reduce_8x16_i32
> +; PROFITABLE-LABEL: @gather_reduce_8x16_i32
> ;
> -; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> -; CHECK: zext <8 x i16> [[L]] to <8 x i32>
> -; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> -; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> -; CHECK: sext i32 [[X]] to i64
> +; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> +; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
> +; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> +; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> +; PROFITABLE: sext i32 [[X]] to i64
> ;
> define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
> entry:
> @@ -137,14 +138,18 @@ for.body:
> br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
> }
>
> -; CHECK-LABEL: @gather_reduce_8x16_i64
> +; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
> ;
> -; CHECK-NOT: load <8 x i16>
> -;
> -; FIXME: We are currently unable to vectorize the case with i64 subtraction
> -;        because the zero extensions are too expensive. The solution here is to
> -;        convert the i64 subtractions to i32 subtractions during vectorization.
> -;        This would then match the case above.
> +; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> +; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
> +; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> +; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> +; UNPROFITABLE: sext i32 [[X]] to i64
> +;
> +; TODO: Although we can now vectorize this case while converting the i64
> +;       subtractions to i32, the cost model currently finds vectorization to be
> +; unprofitable. The cost model is penalizing the sign and zero
> +; extensions in the vectorized version, but they are actually free.
> ;
> define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
> entry:
>
>
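> To summarize the driver-side change (a sketch of the call sequence the patch
> wires into each vectorization site above; R is a configured BoUpSLP instance
> and Operands a candidate bundle):
>
>   R.buildTree(Operands);
>   R.computeMinimumValueSizes(); // new: record demotable widths in MinBWs
>   int Cost = R.getTreeCost();   // now priced using the narrower type
>   if (Cost < -SLPCostThreshold)
>     R.vectorizeTree();          // emits the root trunc and extract sexts
>
> Computing the minimum sizes before costing is what lets the cost model see
> the cheaper narrow operations when deciding whether to vectorize.
>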
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits