[llvm] r258408 - Revert "[SLP] Truncate expressions to minimum required bit width"

Quentin Colombet via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 21 13:46:21 PST 2016


Thanks for the information!

Q.
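
For anyone hitting the same assert: the check that fires is the one r258404
added to getEntryCost() (and which the revert quoted below removes). Here is a
minimal sketch of the failure mode, reusing the names from the quoted diff
(MaxRequiredIntegerTy, ScalarTy, VecTy, VL). It only illustrates how a
non-integer scalar type reaches the assertion; it is not the actual or
proposed fix:

  // Sketch only; names are taken from the reverted r258404. The assert
  // fires when a narrower integer type was computed for the tree but the
  // entry being costed has a non-integer scalar type. A defensive variant
  // would skip the narrowing instead of asserting:
  if (MaxRequiredIntegerTy) {
    if (auto *IT = dyn_cast<IntegerType>(ScalarTy)) {
      // Only narrow when the computed type is strictly smaller.
      if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
        VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
    }
    // Non-integer ScalarTy: keep the original VecTy rather than asserting.
  }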
> On Jan 21, 2016, at 1:42 PM, Matthew Simpson <mssimpso at codeaurora.org> wrote:
> 
> Hi Quentin,
> 
> Yes, I reverted the patch because it caused a buildbot failure due to that
> assertion. The assertion was introduced in the patch.
> 
> -- Matt
> 
> 
> -----Original Message-----
> From: Quentin Colombet [mailto:qcolombet at apple.com] 
> Sent: Thursday, January 21, 2016 4:26 PM
> To: Matthew Simpson
> Cc: llvm-commits at lists.llvm.org
> Subject: Re: [llvm] r258408 - Revert "[SLP] Truncate expressions to minimum required bit width"
> 
> Hi Matthew,
> 
> It would be good to have said why you reverted it.
> 
> In particular, I am seeing an assertion failure which seems to be because of
> this commit and I was wondering if you reverted it because of this or if I
> need to look into producing a test case.
> 
> Specifically, I am seeing:
> Assertion failed: (IT && "Computed smaller type for non-integer value?"),
> function getEntryCost, file lib/Transforms/Vectorize/SLPVectorizer.cpp, line 1500
> 
> Thanks,
> -Quentin
> 
>> On Jan 21, 2016, at 9:17 AM, Matthew Simpson via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>> 
>> Author: mssimpso
>> Date: Thu Jan 21 11:17:20 2016
>> New Revision: 258408
>> 
>> URL: http://llvm.org/viewvc/llvm-project?rev=258408&view=rev
>> Log:
>> Revert "[SLP] Truncate expressions to minimum required bit width"
>> 
>> This reverts commit r258404.
>> 
>> Modified:
>>   llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>>   llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>> 
>> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=258408&r1=258407&r2=258408&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Jan 21 11:17:20 2016
>> @@ -15,22 +15,21 @@
>> //  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
>> //
>> //===----------------------------------------------------------------------===//
>> +#include "llvm/Transforms/Vectorize.h"
>> #include "llvm/ADT/MapVector.h"
>> #include "llvm/ADT/Optional.h"
>> #include "llvm/ADT/PostOrderIterator.h"
>> #include "llvm/ADT/SetVector.h"
>> #include "llvm/ADT/Statistic.h"
>> #include "llvm/Analysis/AliasAnalysis.h"
>> +#include "llvm/Analysis/GlobalsModRef.h"
>> #include "llvm/Analysis/AssumptionCache.h"
>> #include "llvm/Analysis/CodeMetrics.h"
>> -#include "llvm/Analysis/DemandedBits.h"
>> -#include "llvm/Analysis/GlobalsModRef.h"
>> #include "llvm/Analysis/LoopInfo.h"
>> #include "llvm/Analysis/ScalarEvolution.h"
>> #include "llvm/Analysis/ScalarEvolutionExpressions.h"
>> #include "llvm/Analysis/TargetTransformInfo.h"
>> #include "llvm/Analysis/ValueTracking.h"
>> -#include "llvm/Analysis/VectorUtils.h"
>> #include "llvm/IR/DataLayout.h"
>> #include "llvm/IR/Dominators.h"
>> #include "llvm/IR/IRBuilder.h"
>> @@ -45,7 +44,7 @@
>> #include "llvm/Support/CommandLine.h"
>> #include "llvm/Support/Debug.h"
>> #include "llvm/Support/raw_ostream.h"
>> -#include "llvm/Transforms/Vectorize.h"
>> +#include "llvm/Analysis/VectorUtils.h"
>> #include <algorithm>
>> #include <map>
>> #include <memory>
>> @@ -364,12 +363,11 @@ public:
>> 
>>  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
>>          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
>> -          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
>> +          DominatorTree *Dt, AssumptionCache *AC)
>>      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
>> -        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
>> +        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
>>        Builder(Se->getContext()) {
>>    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
>> -    MaxRequiredIntegerTy = nullptr;
>>  }
>> 
>>  /// \brief Vectorize the tree that starts with the elements in \p VL.
>> @@ -401,7 +399,6 @@ public:
>>      BlockScheduling *BS = Iter.second.get();
>>      BS->clear();
>>    }
>> -    MaxRequiredIntegerTy = nullptr;
>>  }
>> 
>>  /// \returns true if the memory operations A and B are consecutive.
>> @@ -422,10 +419,6 @@ public:
>>  /// vectorization factors.
>>  unsigned getVectorElementSize(Value *V);
>> 
>> -  /// Compute the maximum width integer type required to represent the result
>> -  /// of a scalar expression, if such a type exists.
>> -  void computeMaxRequiredIntegerTy();
>> -
>> private:
>>  struct TreeEntry;
>> 
>> @@ -931,13 +924,8 @@ private:
>>  AliasAnalysis *AA;
>>  LoopInfo *LI;
>>  DominatorTree *DT;
>> -  AssumptionCache *AC;
>> -  DemandedBits *DB;
>>  /// Instruction builder to construct the vectorized tree.
>>  IRBuilder<> Builder;
>> -
>> -  // The maximum width integer type required to represent a scalar expression.
>> -  IntegerType *MaxRequiredIntegerTy;
>> };
>> 
>> #ifndef NDEBUG
>> @@ -1493,15 +1481,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>>    ScalarTy = SI->getValueOperand()->getType();
>>  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
>> 
>> -  // If we have computed a smaller type for the expression, update VecTy so
>> -  // that the costs will be accurate.
>> -  if (MaxRequiredIntegerTy) {
>> -    auto *IT = dyn_cast<IntegerType>(ScalarTy);
>> -    assert(IT && "Computed smaller type for non-integer value?");
>> -    if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
>> -      VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
>> -  }
>> -
>>  if (E->NeedToGather) {
>>    if (allConstant(VL))
>>      return 0;
>> @@ -1830,17 +1809,9 @@ int BoUpSLP::getTreeCost() {
>>    if (EphValues.count(EU.User))
>>      continue;
>> 
>> -    // If we plan to rewrite the tree in a smaller type, we will need to sign
>> -    // extend the extracted value back to the original type. Here, we account
>> -    // for the extract and the added cost of the sign extend if needed.
>> -    auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
>> -    if (MaxRequiredIntegerTy) {
>> -      VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
>> -      ExtractCost += TTI->getCastInstrCost(
>> -          Instruction::SExt, EU.Scalar->getType(), MaxRequiredIntegerTy);
>> -    }
>> -    ExtractCost +=
>> -        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
>> +    VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
>> +    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
>> +                                           EU.Lane);
>>  }
>> 
>>  Cost += getSpillCost();
>> @@ -2595,19 +2566,7 @@ Value *BoUpSLP::vectorizeTree() {
>>  }
>> 
>>  Builder.SetInsertPoint(&F->getEntryBlock().front());
>> -  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
>> -
>> -  // If the vectorized tree can be rewritten in a smaller type, we truncate the
>> -  // vectorized root. InstCombine will then rewrite the entire expression. We
>> -  // sign extend the extracted values below.
>> -  if (MaxRequiredIntegerTy) {
>> -    BasicBlock::iterator I(cast<Instruction>(VectorRoot));
>> -    Builder.SetInsertPoint(&*++I);
>> -    auto BundleWidth = VectorizableTree[0].Scalars.size();
>> -    auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
>> -    auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
>> -    VectorizableTree[0].VectorizedValue = Trunc;
>> -  }
>> +  vectorizeTree(&VectorizableTree[0]);
>> 
>>  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
>> 
>> @@ -2640,8 +2599,6 @@ Value *BoUpSLP::vectorizeTree() {
>>          if (PH->getIncomingValue(i) == Scalar) {
>>            Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
>>            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> -            if (MaxRequiredIntegerTy)
>> -              Ex = Builder.CreateSExt(Ex, Scalar->getType());
>>            CSEBlocks.insert(PH->getIncomingBlock(i));
>>            PH->setOperand(i, Ex);
>>          }
>> @@ -2649,16 +2606,12 @@ Value *BoUpSLP::vectorizeTree() {
>>      } else {
>>        Builder.SetInsertPoint(cast<Instruction>(User));
>>        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> -        if (MaxRequiredIntegerTy)
>> -          Ex = Builder.CreateSExt(Ex, Scalar->getType());
>>        CSEBlocks.insert(cast<Instruction>(User)->getParent());
>>        User->replaceUsesOfWith(Scalar, Ex);
>>     }
>>    } else {
>>      Builder.SetInsertPoint(&F->getEntryBlock().front());
>>      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> -      if (MaxRequiredIntegerTy)
>> -        Ex = Builder.CreateSExt(Ex, Scalar->getType());
>>      CSEBlocks.insert(&F->getEntryBlock());
>>      User->replaceUsesOfWith(Scalar, Ex);
>>    }
>> @@ -3227,7 +3180,7 @@ unsigned BoUpSLP::getVectorElementSize(V
>>    // If the current instruction is a load, update MaxWidth to reflect the
>>    // width of the loaded value.
>>    else if (isa<LoadInst>(I))
>> -      MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
>> +      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
>> 
>>    // Otherwise, we need to visit the operands of the instruction. We only
>>    // handle the interesting cases from buildTree here. If an operand is an
>> @@ -3254,85 +3207,6 @@ unsigned BoUpSLP::getVectorElementSize(V
>>  return MaxWidth;
>> }
>> 
>> -void BoUpSLP::computeMaxRequiredIntegerTy() {
>> -
>> -  // If there are no external uses, the expression tree must be rooted by a
>> -  // store. We can't demote in-memory values, so there is nothing to do here.
>> -  if (ExternalUses.empty())
>> -    return;
>> -
>> -  // If the expression is not rooted by a store, these roots should have
>> -  // external uses. We will rely on InstCombine to rewrite the expression in
>> -  // the narrower type. However, InstCombine only rewrites single-use values.
>> -  // This means that if a tree entry other than a root is used externally, it
>> -  // must have multiple uses and InstCombine will not rewrite it. The code
>> -  // below ensures that only the roots are used externally.
>> -  auto &TreeRoot = VectorizableTree[0].Scalars;
>> -  SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(), TreeRoot.end());
>> -  for (auto &EU : ExternalUses)
>> -    if (!ScalarRoots.erase(EU.Scalar))
>> -      return;
>> -  if (!ScalarRoots.empty())
>> -    return;
>> -
>> -  // The maximum bit width required to represent all the instructions in the
>> -  // tree without loss of precision. It would be safe to truncate the
>> -  // expression to this width.
>> -  auto MaxBitWidth = 8u;
>> -
>> -  // We first check if all the bits of the root are demanded. If they're not,
>> -  // we can truncate the root to this narrower type.
>> -  auto *Root = dyn_cast<Instruction>(TreeRoot[0]);
>> -  if (!Root || !isa<IntegerType>(Root->getType()) || !Root->hasOneUse())
>> -    return;
>> -  auto Mask = DB->getDemandedBits(Root);
>> -  if (Mask.countLeadingZeros() > 0)
>> -    MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();
>> -
>> -  // If all the bits of the root are demanded, we can try a little harder to
>> -  // compute a narrower type. This can happen, for example, if the roots are
>> -  // getelementptr indices. InstCombine promotes these indices to the pointer
>> -  // width. Thus, all their bits are technically demanded even though the
>> -  // address computation might be vectorized in a smaller type. We start by
>> -  // looking at each entry in the tree.
>> -  else
>> -    for (auto &Entry : VectorizableTree) {
>> -
>> -      // Get a representative value for the vectorizable bundle. All values in
>> -      // Entry.Scalars should be isomorphic.
>> -      auto *Scalar = Entry.Scalars[0];
>> -
>> -      // If the scalar is used more than once, InstCombine will not rewrite it,
>> -      // so we should give up.
>> -      if (!Scalar->hasOneUse())
>> -        return;
>> -
>> -      // We only compute smaller integer types. If the scalar has a different
>> -      // type, give up.
>> -      auto *IT = dyn_cast<IntegerType>(Scalar->getType());
>> -      if (!IT)
>> -        return;
>> -
>> -      // Compute the maximum bit width required to store the scalar. We use
>> -      // ValueTracking to compute the number of high-order bits we can
>> -      // truncate. We then round up to the next power-of-two.
>> -      auto &DL = F->getParent()->getDataLayout();
>> -      auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
>> -      auto NumTypeBits = IT->getBitWidth();
>> -      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
>> -    }
>> -
>> -  // Round up to the next power-of-two.
>> -  if (!isPowerOf2_64(MaxBitWidth))
>> -    MaxBitWidth = NextPowerOf2(MaxBitWidth);
>> -
>> -  // If the maximum bit width we compute is less than the width of the roots'
>> -  // type, we can proceed with the narrowing. Otherwise, do nothing.
>> -  auto *RootIT = cast<IntegerType>(TreeRoot[0]->getType());
>> -  if (MaxBitWidth > 0 && MaxBitWidth < RootIT->getBitWidth())
>> -    MaxRequiredIntegerTy = IntegerType::get(F->getContext(), MaxBitWidth);
>> -}
>> -
>> /// The SLPVectorizer Pass.
>> struct SLPVectorizer : public FunctionPass {
>>  typedef SmallVector<StoreInst *, 8> StoreList;
>> @@ -3354,7 +3228,6 @@ struct SLPVectorizer : public FunctionPa
>>  LoopInfo *LI;
>>  DominatorTree *DT;
>>  AssumptionCache *AC;
>> -  DemandedBits *DB;
>> 
>>  bool runOnFunction(Function &F) override {
>>    if (skipOptnoneFunction(F))
>> @@ -3368,7 +3241,6 @@ struct SLPVectorizer : public FunctionPa
>>    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
>>    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
>>    AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
>> -    DB = &getAnalysis<DemandedBits>();
>> 
>>    Stores.clear();
>>    GEPs.clear();
>> @@ -3398,7 +3270,7 @@ struct SLPVectorizer : public FunctionPa
>> 
>>    // Use the bottom up slp vectorizer to construct chains that start with
>>    // store instructions.
>> -    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
>> +    BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
>> 
>>    // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
>>    // delete instructions.
>> @@ -3441,7 +3313,6 @@ struct SLPVectorizer : public FunctionPa
>>    AU.addRequired<TargetTransformInfoWrapperPass>();
>>    AU.addRequired<LoopInfoWrapperPass>();
>>    AU.addRequired<DominatorTreeWrapperPass>();
>> -    AU.addRequired<DemandedBits>();
>>    AU.addPreserved<LoopInfoWrapperPass>();
>>    AU.addPreserved<DominatorTreeWrapperPass>();
>>    AU.addPreserved<AAResultsWrapperPass>();
>> @@ -3546,7 +3417,6 @@ bool SLPVectorizer::vectorizeStoreChain(
>>    ArrayRef<Value *> Operands = Chain.slice(i, VF);
>> 
>>    R.buildTree(Operands);
>> -    R.computeMaxRequiredIntegerTy();
>> 
>>    int Cost = R.getTreeCost();
>> 
>> @@ -3746,7 +3616,6 @@ bool SLPVectorizer::tryToVectorizeList(A
>>      Value *ReorderedOps[] = { Ops[1], Ops[0] };
>>      R.buildTree(ReorderedOps, None);
>>    }
>> -    R.computeMaxRequiredIntegerTy();
>>    int Cost = R.getTreeCost();
>> 
>>    if (Cost < -SLPCostThreshold) {
>> @@ -4013,7 +3882,6 @@ public:
>> 
>>    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
>>      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
>> -      V.computeMaxRequiredIntegerTy();
>> 
>>      // Estimate cost.
>>      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
>> 
>> Modified: llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=258408&r1=258407&r2=258408&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (original)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Thu Jan 21 11:17:20 2016
>> @@ -1,5 +1,4 @@
>> -; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
>> -; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE
>> +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
>> 
>> target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
>> target triple = "aarch64--linux-gnu"
>> @@ -19,13 +18,13 @@ target triple = "aarch64--linux-gnu"
>> ;   return sum;
>> ; }
>> 
>> -; PROFITABLE-LABEL: @gather_reduce_8x16_i32
>> +; CHECK-LABEL: @gather_reduce_8x16_i32
>> ;
>> -; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
>> -; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
>> -; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
>> -; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
>> -; PROFITABLE: sext i32 [[X]] to i64
>> +; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
>> +; CHECK: zext <8 x i16> [[L]] to <8 x i32>
>> +; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
>> +; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
>> +; CHECK: sext i32 [[X]] to i64
>> ;
>> define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
>> entry:
>> @@ -138,18 +137,14 @@ for.body:
>>  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
>> }
>> 
>> -; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
>> +; CHECK-LABEL: @gather_reduce_8x16_i64
>> ;
>> -; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
>> -; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
>> -; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
>> -; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
>> -; UNPROFITABLE: sext i32 [[X]] to i64
>> -;
>> -; TODO: Although we can now vectorize this case while converting the i64
>> -;       subtractions to i32, the cost model currently finds vectorization to be
>> -;       unprofitable. The cost model is penalizing the sign and zero
>> -;       extensions in the vectorized version, but they are actually free.
>> +; CHECK-NOT: load <8 x i16>
>> +;
>> +; FIXME: We are currently unable to vectorize the case with i64 subtraction
>> +;        because the zero extensions are too expensive. The solution here is to
>> +;        convert the i64 subtractions to i32 subtractions during vectorization.
>> +;        This would then match the case above.
>> ;
>> define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
>> entry:
>> 
>> 
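
For reference, the narrowing decision removed above reduces to simple
bit-width arithmetic: subtract the number of known sign bits from the type
width, clamp to a floor of 8, round up to a power of two, and narrow only if
the result is smaller than the root's width; the extra SExt back to the
original type is what the getTreeCost() hunk accounted for. Below is a
standalone C++ sketch of just that arithmetic. The helpers narrowedWidth and
nextPowerOf2 are illustrative, not LLVM APIs; in-tree, the sign-bit count
would come from ComputeNumSignBits():

  #include <cstdio>

  // Smallest power of two >= N (for N >= 1), mirroring the
  // isPowerOf2_64/NextPowerOf2 combination in the removed code.
  static unsigned nextPowerOf2(unsigned N) {
    unsigned P = 1;
    while (P < N)
      P <<= 1;
    return P;
  }

  // Width the expression could be truncated to, or RootWidth when no
  // narrowing is possible.
  static unsigned narrowedWidth(unsigned NumTypeBits, unsigned NumSignBits,
                                unsigned RootWidth) {
    unsigned MaxBitWidth = NumTypeBits - NumSignBits;
    if (MaxBitWidth < 8)
      MaxBitWidth = 8; // the removed code starts from a floor of 8 bits
    MaxBitWidth = nextPowerOf2(MaxBitWidth);
    return MaxBitWidth < RootWidth ? MaxBitWidth : RootWidth;
  }

  int main() {
    // An i64 scalar with 49 known sign bits needs 64 - 49 = 15 bits,
    // which rounds up to 16, so the tree could be rewritten in i16.
    std::printf("%u\n", narrowedWidth(64, 49, 64)); // prints 16
  }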