[llvm] r258408 - Revert "[SLP] Truncate expressions to minimum required bit width"
Matthew Simpson via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 21 13:53:02 PST 2016
No problem, and sorry for the confusion! I should also have mentioned that a
bug was opened to track the issue until a fix is ready.
https://llvm.org/bugs/show_bug.cgi?id=26239
-- Matt
-----Original Message-----
From: Quentin Colombet [mailto:qcolombet at apple.com]
Sent: Thursday, January 21, 2016 4:46 PM
To: Matthew Simpson
Cc: llvm-commits at lists.llvm.org
Subject: Re: [llvm] r258408 - Revert "[SLP] Truncate expressions to minimum
required bit width"
Thanks for the information!
Q.
> On Jan 21, 2016, at 1:42 PM, Matthew Simpson <mssimpso at codeaurora.org>
wrote:
>
> Hi Quentin,
>
> Yes, I reverted the patch because it caused a buildbot failure due to
> that assertion. The assertion was introduced in the patch.
>
> -- Matt
>
>
> -----Original Message-----
> From: Quentin Colombet [mailto:qcolombet at apple.com]
> Sent: Thursday, January 21, 2016 4:26 PM
> To: Matthew Simpson
> Cc: llvm-commits at lists.llvm.org
> Subject: Re: [llvm] r258408 - Revert "[SLP] Truncate expressions to
> minimum required bit width"
>
> Hi Matthew,
>
> Would be good to have said why you reverted it.
>
> In particular, I am seeing an assertion failure which seems to be
> because of this commit and I was wondering if you reverted it because
> of this or if I need to look into producing a test case.
>
> Specifically, I am seeing:
> Assertion failed: (IT && "Computed smaller type for non-integer
> value?"), function getEntryCost, file
> lib/Transforms/Vectorize/SLPVectorizer.cpp, line
> 1500
>
> Thanks,
> -Quentin
>
>> On Jan 21, 2016, at 9:17 AM, Matthew Simpson via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
>>
>> Author: mssimpso
>> Date: Thu Jan 21 11:17:20 2016
>> New Revision: 258408
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=258408&view=rev
>> Log:
>> Revert "[SLP] Truncate expressions to minimum required bit width"
>>
>> This reverts commit r258404.
>>
>> Modified:
>> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>>
>> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectori
>> z e/SLPVectorizer.cpp?rev=258408&r1=258407&r2=258408&view=diff
>> =====================================================================
>> =
>> ========
>> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Jan 21
>> +++ 11:17:20 2016
>> @@ -15,22 +15,21 @@
>> // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
>> //
>> //===----------------------------------------------------------------
>> -
>> -----===//
>> +#include "llvm/Transforms/Vectorize.h"
>> #include "llvm/ADT/MapVector.h"
>> #include "llvm/ADT/Optional.h"
>> #include "llvm/ADT/PostOrderIterator.h"
>> #include "llvm/ADT/SetVector.h"
>> #include "llvm/ADT/Statistic.h"
>> #include "llvm/Analysis/AliasAnalysis.h"
>> +#include "llvm/Analysis/GlobalsModRef.h"
>> #include "llvm/Analysis/AssumptionCache.h"
>> #include "llvm/Analysis/CodeMetrics.h"
>> -#include "llvm/Analysis/DemandedBits.h"
>> -#include "llvm/Analysis/GlobalsModRef.h"
>> #include "llvm/Analysis/LoopInfo.h"
>> #include "llvm/Analysis/ScalarEvolution.h"
>> #include "llvm/Analysis/ScalarEvolutionExpressions.h"
>> #include "llvm/Analysis/TargetTransformInfo.h"
>> #include "llvm/Analysis/ValueTracking.h"
>> -#include "llvm/Analysis/VectorUtils.h"
>> #include "llvm/IR/DataLayout.h"
>> #include "llvm/IR/Dominators.h"
>> #include "llvm/IR/IRBuilder.h"
>> @@ -45,7 +44,7 @@
>> #include "llvm/Support/CommandLine.h"
>> #include "llvm/Support/Debug.h"
>> #include "llvm/Support/raw_ostream.h"
>> -#include "llvm/Transforms/Vectorize.h"
>> +#include "llvm/Analysis/VectorUtils.h"
>> #include <algorithm>
>> #include <map>
>> #include <memory>
>> @@ -364,12 +363,11 @@ public:
>>
>> BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
>> TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
>> - DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
>> + DominatorTree *Dt, AssumptionCache *AC)
>> : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
>> - SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
> DB(DB),
>> + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
>> Builder(Se->getContext()) {
>> CodeMetrics::collectEphemeralValues(F, AC, EphValues);
>> - MaxRequiredIntegerTy = nullptr;
>> }
>>
>> /// \brief Vectorize the tree that starts with the elements in \p VL.
>> @@ -401,7 +399,6 @@ public:
>> BlockScheduling *BS = Iter.second.get();
>> BS->clear();
>> }
>> - MaxRequiredIntegerTy = nullptr;
>> }
>>
>> /// \returns true if the memory operations A and B are consecutive.
>> @@ -422,10 +419,6 @@ public:
>> /// vectorization factors.
>> unsigned getVectorElementSize(Value *V);
>>
>> - /// Compute the maximum width integer type required to represent
>> the result
>> - /// of a scalar expression, if such a type exists.
>> - void computeMaxRequiredIntegerTy();
>> -
>> private:
>> struct TreeEntry;
>>
>> @@ -931,13 +924,8 @@ private:
>> AliasAnalysis *AA;
>> LoopInfo *LI;
>> DominatorTree *DT;
>> - AssumptionCache *AC;
>> - DemandedBits *DB;
>> /// Instruction builder to construct the vectorized tree.
>> IRBuilder<> Builder;
>> -
>> - // The maximum width integer type required to represent a scalar
> expression.
>> - IntegerType *MaxRequiredIntegerTy; };
>>
>> #ifndef NDEBUG
>> @@ -1493,15 +1481,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>> ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy =
>> VectorType::get(ScalarTy, VL.size());
>>
>> - // If we have computed a smaller type for the expression, update
>> VecTy so
>> - // that the costs will be accurate.
>> - if (MaxRequiredIntegerTy) {
>> - auto *IT = dyn_cast<IntegerType>(ScalarTy);
>> - assert(IT && "Computed smaller type for non-integer value?");
>> - if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
>> - VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
>> - }
>> -
>> if (E->NeedToGather) {
>> if (allConstant(VL))
>> return 0;
>> @@ -1830,17 +1809,9 @@ int BoUpSLP::getTreeCost() {
>> if (EphValues.count(EU.User))
>> continue;
>>
>> - // If we plan to rewrite the tree in a smaller type, we will need to
> sign
>> - // extend the extracted value back to the original type. Here, we
> account
>> - // for the extract and the added cost of the sign extend if needed.
>> - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
>> - if (MaxRequiredIntegerTy) {
>> - VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
>> - ExtractCost += TTI->getCastInstrCost(
>> - Instruction::SExt, EU.Scalar->getType(),
MaxRequiredIntegerTy);
>> - }
>> - ExtractCost +=
>> - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
> EU.Lane);
>> + VectorType *VecTy = VectorType::get(EU.Scalar->getType(),
> BundleWidth);
>> + ExtractCost +=
>> + TTI->getVectorInstrCost(Instruction::ExtractElement,
> VecTy,
>> + EU.Lane);
>> }
>>
>> Cost += getSpillCost();
>> @@ -2595,19 +2566,7 @@ Value *BoUpSLP::vectorizeTree() { }
>>
>> Builder.SetInsertPoint(&F->getEntryBlock().front());
>> - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
>> -
>> - // If the vectorized tree can be rewritten in a smaller type, we
>> truncate the
>> - // vectorized root. InstCombine will then rewrite the entire
>> expression. We
>> - // sign extend the extracted values below.
>> - if (MaxRequiredIntegerTy) {
>> - BasicBlock::iterator I(cast<Instruction>(VectorRoot));
>> - Builder.SetInsertPoint(&*++I);
>> - auto BundleWidth = VectorizableTree[0].Scalars.size();
>> - auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy,
BundleWidth);
>> - auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
>> - VectorizableTree[0].VectorizedValue = Trunc;
>> - }
>> + vectorizeTree(&VectorizableTree[0]);
>>
>> DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << "
>> values .\n");
>>
>> @@ -2640,8 +2599,6 @@ Value *BoUpSLP::vectorizeTree() {
>> if (PH->getIncomingValue(i) == Scalar) {
>>
> Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
>> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> - if (MaxRequiredIntegerTy)
>> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
>> CSEBlocks.insert(PH->getIncomingBlock(i));
>> PH->setOperand(i, Ex);
>> }
>> @@ -2649,16 +2606,12 @@ Value *BoUpSLP::vectorizeTree() {
>> } else {
>> Builder.SetInsertPoint(cast<Instruction>(User));
>> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> - if (MaxRequiredIntegerTy)
>> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
>> CSEBlocks.insert(cast<Instruction>(User)->getParent());
>> User->replaceUsesOfWith(Scalar, Ex);
>> }
>> } else {
>> Builder.SetInsertPoint(&F->getEntryBlock().front());
>> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
>> - if (MaxRequiredIntegerTy)
>> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
>> CSEBlocks.insert(&F->getEntryBlock());
>> User->replaceUsesOfWith(Scalar, Ex);
>> }
>> @@ -3227,7 +3180,7 @@ unsigned BoUpSLP::getVectorElementSize(V
>> // If the current instruction is a load, update MaxWidth to
>> reflect
> the
>> // width of the loaded value.
>> else if (isa<LoadInst>(I))
>> - MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
>> + MaxWidth = std::max(MaxWidth,
>> + (unsigned)DL.getTypeSizeInBits(Ty));
>>
>> // Otherwise, we need to visit the operands of the instruction. We
> only
>> // handle the interesting cases from buildTree here. If an operand
>> is an @@ -3254,85 +3207,6 @@ unsigned BoUpSLP::getVectorElementSize(V
>> return MaxWidth; }
>>
>> -void BoUpSLP::computeMaxRequiredIntegerTy() {
>> -
>> - // If there are no external uses, the expression tree must be
>> rooted by a
>> - // store. We can't demote in-memory values, so there is nothing to
>> do
> here.
>> - if (ExternalUses.empty())
>> - return;
>> -
>> - // If the expression is not rooted by a store, these roots should
>> have
>> - // external uses. We will rely on InstCombine to rewrite the
>> expression in
>> - // the narrower type. However, InstCombine only rewrites
>> single-use
> values.
>> - // This means that if a tree entry other than a root is used
>> externally, it
>> - // must have multiple uses and InstCombine will not rewrite it.
>> The code
>> - // below ensures that only the roots are used externally.
>> - auto &TreeRoot = VectorizableTree[0].Scalars;
>> - SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(),
>> TreeRoot.end());
>> - for (auto &EU : ExternalUses)
>> - if (!ScalarRoots.erase(EU.Scalar))
>> - return;
>> - if (!ScalarRoots.empty())
>> - return;
>> -
>> - // The maximum bit width required to represent all the
>> instructions in the
>> - // tree without loss of precision. It would be safe to truncate
>> the
>> - // expression to this width.
>> - auto MaxBitWidth = 8u;
>> -
>> - // We first check if all the bits of the root are demanded. If
>> they're not,
>> - // we can truncate the root to this narrower type.
>> - auto *Root = dyn_cast<Instruction>(TreeRoot[0]);
>> - if (!Root || !isa<IntegerType>(Root->getType()) || !Root->hasOneUse())
>> - return;
>> - auto Mask = DB->getDemandedBits(Root);
>> - if (Mask.countLeadingZeros() > 0)
>> - MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();
>> -
>> - // If all the bits of the root are demanded, we can try a little
>> harder to
>> - // compute a narrower type. This can happen, for example, if the
>> roots are
>> - // getelementptr indices. InstCombine promotes these indices to
>> the pointer
>> - // width. Thus, all their bits are technically demanded even
>> though the
>> - // address computation might be vectorized in a smaller type. We
>> start by
>> - // looking at each entry in the tree.
>> - else
>> - for (auto &Entry : VectorizableTree) {
>> -
>> - // Get a representative value for the vectorizable bundle. All
> values in
>> - // Entry.Scalars should be isomorphic.
>> - auto *Scalar = Entry.Scalars[0];
>> -
>> - // If the scalar is used more than once, InstCombine will not
> rewrite it,
>> - // so we should give up.
>> - if (!Scalar->hasOneUse())
>> - return;
>> -
>> - // We only compute smaller integer types. If the scalar has a
> different
>> - // type, give up.
>> - auto *IT = dyn_cast<IntegerType>(Scalar->getType());
>> - if (!IT)
>> - return;
>> -
>> - // Compute the maximum bit width required to store the scalar. We
> use
>> - // ValueTracking to compute the number of high-order bits we can
>> - // truncate. We then round up to the next power-of-two.
>> - auto &DL = F->getParent()->getDataLayout();
>> - auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
>> - auto NumTypeBits = IT->getBitWidth();
>> - MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits,
> MaxBitWidth);
>> - }
>> -
>> - // Round up to the next power-of-two.
>> - if (!isPowerOf2_64(MaxBitWidth))
>> - MaxBitWidth = NextPowerOf2(MaxBitWidth);
>> -
>> - // If the maximum bit width we compute is less than the width of
>> the
> roots'
>> - // type, we can proceed with the narrowing. Otherwise, do nothing.
>> - auto *RootIT = cast<IntegerType>(TreeRoot[0]->getType());
>> - if (MaxBitWidth > 0 && MaxBitWidth < RootIT->getBitWidth())
>> - MaxRequiredIntegerTy = IntegerType::get(F->getContext(),
> MaxBitWidth);
>> -}
>> -
>> /// The SLPVectorizer Pass.
>> struct SLPVectorizer : public FunctionPass { typedef
>> SmallVector<StoreInst *, 8> StoreList; @@ -3354,7 +3228,6 @@ struct
>> SLPVectorizer : public FunctionPa LoopInfo *LI; DominatorTree *DT;
>> AssumptionCache *AC;
>> - DemandedBits *DB;
>>
>> bool runOnFunction(Function &F) override {
>> if (skipOptnoneFunction(F))
>> @@ -3368,7 +3241,6 @@ struct SLPVectorizer : public FunctionPa
>> LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
>> DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
>> AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
>> - DB = &getAnalysis<DemandedBits>();
>>
>> Stores.clear();
>> GEPs.clear();
>> @@ -3398,7 +3270,7 @@ struct SLPVectorizer : public FunctionPa
>>
>> // Use the bottom up slp vectorizer to construct chains that start
> with
>> // store instructions.
>> - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
>> + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
>>
>> // A general note: the vectorizer must use
>> BoUpSLP::eraseInstruction()
> to
>> // delete instructions.
>> @@ -3441,7 +3313,6 @@ struct SLPVectorizer : public FunctionPa
>> AU.addRequired<TargetTransformInfoWrapperPass>();
>> AU.addRequired<LoopInfoWrapperPass>();
>> AU.addRequired<DominatorTreeWrapperPass>();
>> - AU.addRequired<DemandedBits>();
>> AU.addPreserved<LoopInfoWrapperPass>();
>> AU.addPreserved<DominatorTreeWrapperPass>();
>> AU.addPreserved<AAResultsWrapperPass>();
>> @@ -3546,7 +3417,6 @@ bool SLPVectorizer::vectorizeStoreChain(
>> ArrayRef<Value *> Operands = Chain.slice(i, VF);
>>
>> R.buildTree(Operands);
>> - R.computeMaxRequiredIntegerTy();
>>
>> int Cost = R.getTreeCost();
>>
>> @@ -3746,7 +3616,6 @@ bool SLPVectorizer::tryToVectorizeList(A
>> Value *ReorderedOps[] = { Ops[1], Ops[0] };
>> R.buildTree(ReorderedOps, None);
>> }
>> - R.computeMaxRequiredIntegerTy();
>> int Cost = R.getTreeCost();
>>
>> if (Cost < -SLPCostThreshold) {
>> @@ -4013,7 +3882,6 @@ public:
>>
>> for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
>> V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth),
> ReductionOps);
>> - V.computeMaxRequiredIntegerTy();
>>
>> // Estimate cost.
>> int Cost = V.getTreeCost() + getReductionCost(TTI,
>> ReducedVals[i]);
>>
>> Modified:
>> llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>> URL:
>> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVec
>> t
>> orizer/AArch64/gather-reduce.ll?rev=258408&r1=258407&r2=258408&view=d
>> i
>> ff
>> =====================================================================
>> =
>> ========
>> --- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>> (original)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>> +++ Thu Jan 21 11:17:20 2016
>> @@ -1,5 +1,4 @@
>> -; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
>> --check-prefix=PROFITABLE -; RUN: opt -S -slp-vectorizer
>> -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s
>> --check-prefix=UNPROFITABLE
>> +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
>>
>> target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
>> target triple = "aarch64--linux-gnu"
>> @@ -19,13 +18,13 @@ target triple = "aarch64--linux-gnu"
>> ; return sum;
>> ; }
>>
>> -; PROFITABLE-LABEL: @gather_reduce_8x16_i32
>> +; CHECK-LABEL: @gather_reduce_8x16_i32
>> ;
>> -; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> -; PROFITABLE:
>> zext <8 x i16> [[L]] to <8 x i32> -; PROFITABLE: [[S:%[a-zA-Z0-9.]+]]
>> = sub nsw <8 x i32> -; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] =
>> extractelement <8 x i32> [[S]] -; PROFITABLE: sext i32 [[X]] to i64
>> +; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> ; CHECK: zext <8 x
>> +i16> [[L]] to <8 x i32> ; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x
>> +i32> ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
>> +i32> ;
>> +CHECK: sext i32 [[X]] to i64
>> ;
>> define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16*
>> nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
>> entry:
>> @@ -138,18 +137,14 @@ for.body:
>> br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body }
>>
>> -; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
>> +; CHECK-LABEL: @gather_reduce_8x16_i64
>> ;
>> -; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> -;
>> UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32> -; UNPROFITABLE:
>> [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> -; UNPROFITABLE:
>> [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] -; UNPROFITABLE:
>> sext i32 [[X]] to i64 -; -; TODO: Although we can now vectorize this
>> case while converting the i64
>> -; subtractions to i32, the cost model currently finds
vectorization
> to be
>> -; unprofitable. The cost model is penalizing the sign and zero
>> -; extensions in the vectorized version, but they are actually
free.
>> +; CHECK-NOT: load <8 x i16>
>> +;
>> +; FIXME: We are currently unable to vectorize the case with i64
> subtraction
>> +; because the zero extensions are too expensive. The solution
here
> is to
>> +; convert the i64 subtractions to i32 subtractions during
> vectorization.
>> +; This would then match the case above.
>> ;
>> define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16*
>> nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
>> entry:
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
>
More information about the llvm-commits
mailing list