[llvm] r258408 - Revert "[SLP] Truncate expressions to minimum required bit width"
Matthew Simpson via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 21 13:42:35 PST 2016
Hi Quentin,
Yes, I reverted the patch because it caused a buildbot failure due to that
assertion. The assertion was introduced in the patch.
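For reference, the assert comes from the block the revert removes from
getEntryCost() (visible in the diff below). A minimal sketch of the failing
path, assuming a bundle whose scalar type is not an integer reaches the cost
computation while MaxRequiredIntegerTy is set:

    // From the reverted r258404, in BoUpSLP::getEntryCost(). ScalarTy is the
    // type of the bundle being costed. If the narrowing analysis decided to
    // truncate the tree but this bundle is non-integer, the dyn_cast yields
    // null and the assert fires.
    if (MaxRequiredIntegerTy) {
      auto *IT = dyn_cast<IntegerType>(ScalarTy);
      assert(IT && "Computed smaller type for non-integer value?");
      if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
        VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
    }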
-- Matt
-----Original Message-----
From: Quentin Colombet [mailto:qcolombet at apple.com]
Sent: Thursday, January 21, 2016 4:26 PM
To: Matthew Simpson
Cc: llvm-commits at lists.llvm.org
Subject: Re: [llvm] r258408 - Revert "[SLP] Truncate expressions to minimum required bit width"
Hi Matthew,
It would be good to have said why you reverted it in the commit message.
In particular, I am seeing an assertion failure that seems to be caused by this
commit, and I was wondering whether you reverted it for that reason or whether
I need to look into producing a test case.
Specifically, I am seeing:
Assertion failed: (IT && "Computed smaller type for non-integer value?"), function getEntryCost, file lib/Transforms/Vectorize/SLPVectorizer.cpp, line 1500
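For anyone blocked on this before a fix lands, a defensive variant of that
block, which skips the narrowing instead of asserting when the bundle's scalar
type is not an integer, might look like the sketch below. This is only a
workaround sketch using the reverted patch's names, not necessarily how it
should be fixed:

    // Hypothetical guard: only narrow integer-typed bundles; leave VecTy
    // unchanged (no assert) for any other scalar type.
    if (MaxRequiredIntegerTy)
      if (auto *IT = dyn_cast<IntegerType>(ScalarTy))
        if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
          VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());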
Thanks,
-Quentin
> On Jan 21, 2016, at 9:17 AM, Matthew Simpson via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
> Author: mssimpso
> Date: Thu Jan 21 11:17:20 2016
> New Revision: 258408
>
> URL: http://llvm.org/viewvc/llvm-project?rev=258408&view=rev
> Log:
> Revert "[SLP] Truncate expressions to minimum required bit width"
>
> This reverts commit r258404.
>
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=258408&r1=258407&r2=258408&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Jan 21 11:17:20 2016
> @@ -15,22 +15,21 @@
> // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
> //
> //===----------------------------------------------------------------------===//
> +#include "llvm/Transforms/Vectorize.h"
> #include "llvm/ADT/MapVector.h"
> #include "llvm/ADT/Optional.h"
> #include "llvm/ADT/PostOrderIterator.h"
> #include "llvm/ADT/SetVector.h"
> #include "llvm/ADT/Statistic.h"
> #include "llvm/Analysis/AliasAnalysis.h"
> +#include "llvm/Analysis/GlobalsModRef.h"
> #include "llvm/Analysis/AssumptionCache.h"
> #include "llvm/Analysis/CodeMetrics.h"
> -#include "llvm/Analysis/DemandedBits.h"
> -#include "llvm/Analysis/GlobalsModRef.h"
> #include "llvm/Analysis/LoopInfo.h"
> #include "llvm/Analysis/ScalarEvolution.h"
> #include "llvm/Analysis/ScalarEvolutionExpressions.h"
> #include "llvm/Analysis/TargetTransformInfo.h"
> #include "llvm/Analysis/ValueTracking.h"
> -#include "llvm/Analysis/VectorUtils.h"
> #include "llvm/IR/DataLayout.h"
> #include "llvm/IR/Dominators.h"
> #include "llvm/IR/IRBuilder.h"
> @@ -45,7 +44,7 @@
> #include "llvm/Support/CommandLine.h"
> #include "llvm/Support/Debug.h"
> #include "llvm/Support/raw_ostream.h"
> -#include "llvm/Transforms/Vectorize.h"
> +#include "llvm/Analysis/VectorUtils.h"
> #include <algorithm>
> #include <map>
> #include <memory>
> @@ -364,12 +363,11 @@ public:
>
> BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
> TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
> - DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB)
> + DominatorTree *Dt, AssumptionCache *AC)
> : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
> - SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
> + SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
> Builder(Se->getContext()) {
> CodeMetrics::collectEphemeralValues(F, AC, EphValues);
> - MaxRequiredIntegerTy = nullptr;
> }
>
> /// \brief Vectorize the tree that starts with the elements in \p VL.
> @@ -401,7 +399,6 @@ public:
> BlockScheduling *BS = Iter.second.get();
> BS->clear();
> }
> - MaxRequiredIntegerTy = nullptr;
> }
>
> /// \returns true if the memory operations A and B are consecutive.
> @@ -422,10 +419,6 @@ public:
> /// vectorization factors.
> unsigned getVectorElementSize(Value *V);
>
> - /// Compute the maximum width integer type required to represent the result
> - /// of a scalar expression, if such a type exists.
> - void computeMaxRequiredIntegerTy();
> -
> private:
> struct TreeEntry;
>
> @@ -931,13 +924,8 @@ private:
> AliasAnalysis *AA;
> LoopInfo *LI;
> DominatorTree *DT;
> - AssumptionCache *AC;
> - DemandedBits *DB;
> /// Instruction builder to construct the vectorized tree.
> IRBuilder<> Builder;
> -
> - // The maximum width integer type required to represent a scalar expression.
> - IntegerType *MaxRequiredIntegerTy;
> };
>
> #ifndef NDEBUG
> @@ -1493,15 +1481,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
> ScalarTy = SI->getValueOperand()->getType();
> VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
>
> - // If we have computed a smaller type for the expression, update VecTy so
> - // that the costs will be accurate.
> - if (MaxRequiredIntegerTy) {
> - auto *IT = dyn_cast<IntegerType>(ScalarTy);
> - assert(IT && "Computed smaller type for non-integer value?");
> - if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
> - VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
> - }
> -
> if (E->NeedToGather) {
> if (allConstant(VL))
> return 0;
> @@ -1830,17 +1809,9 @@ int BoUpSLP::getTreeCost() {
> if (EphValues.count(EU.User))
> continue;
>
> - // If we plan to rewrite the tree in a smaller type, we will need to sign
> - // extend the extracted value back to the original type. Here, we account
> - // for the extract and the added cost of the sign extend if needed.
> - auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
> - if (MaxRequiredIntegerTy) {
> - VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
> - ExtractCost += TTI->getCastInstrCost(
> - Instruction::SExt, EU.Scalar->getType(), MaxRequiredIntegerTy);
> - }
> - ExtractCost +=
> - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
> + VectorType *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
> + ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
> + EU.Lane);
> }
>
> Cost += getSpillCost();
> @@ -2595,19 +2566,7 @@ Value *BoUpSLP::vectorizeTree() {
> }
>
> Builder.SetInsertPoint(&F->getEntryBlock().front());
> - auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
> -
> - // If the vectorized tree can be rewritten in a smaller type, we truncate the
> - // vectorized root. InstCombine will then rewrite the entire expression. We
> - // sign extend the extracted values below.
> - if (MaxRequiredIntegerTy) {
> - BasicBlock::iterator I(cast<Instruction>(VectorRoot));
> - Builder.SetInsertPoint(&*++I);
> - auto BundleWidth = VectorizableTree[0].Scalars.size();
> - auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
> - auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
> - VectorizableTree[0].VectorizedValue = Trunc;
> - }
> + vectorizeTree(&VectorizableTree[0]);
>
> DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
>
> @@ -2640,8 +2599,6 @@ Value *BoUpSLP::vectorizeTree() {
> if (PH->getIncomingValue(i) == Scalar) {
>
> Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> - if (MaxRequiredIntegerTy)
> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(PH->getIncomingBlock(i));
> PH->setOperand(i, Ex);
> }
> @@ -2649,16 +2606,12 @@ Value *BoUpSLP::vectorizeTree() {
> } else {
> Builder.SetInsertPoint(cast<Instruction>(User));
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> - if (MaxRequiredIntegerTy)
> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(cast<Instruction>(User)->getParent());
> User->replaceUsesOfWith(Scalar, Ex);
> }
> } else {
> Builder.SetInsertPoint(&F->getEntryBlock().front());
> Value *Ex = Builder.CreateExtractElement(Vec, Lane);
> - if (MaxRequiredIntegerTy)
> - Ex = Builder.CreateSExt(Ex, Scalar->getType());
> CSEBlocks.insert(&F->getEntryBlock());
> User->replaceUsesOfWith(Scalar, Ex);
> }
> @@ -3227,7 +3180,7 @@ unsigned BoUpSLP::getVectorElementSize(V
> // If the current instruction is a load, update MaxWidth to reflect the
> // width of the loaded value.
> else if (isa<LoadInst>(I))
> - MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
> + MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
>
> // Otherwise, we need to visit the operands of the instruction. We only
> // handle the interesting cases from buildTree here. If an operand is an
> @@ -3254,85 +3207,6 @@ unsigned BoUpSLP::getVectorElementSize(V
> return MaxWidth;
> }
>
> -void BoUpSLP::computeMaxRequiredIntegerTy() {
> -
> - // If there are no external uses, the expression tree must be rooted by a
> - // store. We can't demote in-memory values, so there is nothing to do here.
> - if (ExternalUses.empty())
> - return;
> -
> - // If the expression is not rooted by a store, these roots should have
> - // external uses. We will rely on InstCombine to rewrite the expression in
> - // the narrower type. However, InstCombine only rewrites single-use values.
> - // This means that if a tree entry other than a root is used externally, it
> - // must have multiple uses and InstCombine will not rewrite it. The code
> - // below ensures that only the roots are used externally.
> - auto &TreeRoot = VectorizableTree[0].Scalars;
> - SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(), TreeRoot.end());
> - for (auto &EU : ExternalUses)
> - if (!ScalarRoots.erase(EU.Scalar))
> - return;
> - if (!ScalarRoots.empty())
> - return;
> -
> - // The maximum bit width required to represent all the instructions in the
> - // tree without loss of precision. It would be safe to truncate the
> - // expression to this width.
> - auto MaxBitWidth = 8u;
> -
> - // We first check if all the bits of the root are demanded. If they're not,
> - // we can truncate the root to this narrower type.
> - auto *Root = dyn_cast<Instruction>(TreeRoot[0]);
> - if (!Root || !isa<IntegerType>(Root->getType()) || !Root->hasOneUse())
> - return;
> - auto Mask = DB->getDemandedBits(Root);
> - if (Mask.countLeadingZeros() > 0)
> - MaxBitWidth = Mask.getBitWidth() - Mask.countLeadingZeros();
> -
> - // If all the bits of the root are demanded, we can try a little harder to
> - // compute a narrower type. This can happen, for example, if the roots are
> - // getelementptr indices. InstCombine promotes these indices to the pointer
> - // width. Thus, all their bits are technically demanded even though the
> - // address computation might be vectorized in a smaller type. We start by
> - // looking at each entry in the tree.
> - else
> - for (auto &Entry : VectorizableTree) {
> -
> - // Get a representative value for the vectorizable bundle. All values in
> - // Entry.Scalars should be isomorphic.
> - auto *Scalar = Entry.Scalars[0];
> -
> - // If the scalar is used more than once, InstCombine will not rewrite it,
> - // so we should give up.
> - if (!Scalar->hasOneUse())
> - return;
> -
> - // We only compute smaller integer types. If the scalar has a different
> - // type, give up.
> - auto *IT = dyn_cast<IntegerType>(Scalar->getType());
> - if (!IT)
> - return;
> -
> - // Compute the maximum bit width required to store the scalar. We use
> - // ValueTracking to compute the number of high-order bits we can
> - // truncate. We then round up to the next power-of-two.
> - auto &DL = F->getParent()->getDataLayout();
> - auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
> - auto NumTypeBits = IT->getBitWidth();
> - MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
> - }
> -
> - // Round up to the next power-of-two.
> - if (!isPowerOf2_64(MaxBitWidth))
> - MaxBitWidth = NextPowerOf2(MaxBitWidth);
> -
> - // If the maximum bit width we compute is less than the width of the roots'
> - // type, we can proceed with the narrowing. Otherwise, do nothing.
> - auto *RootIT = cast<IntegerType>(TreeRoot[0]->getType());
> - if (MaxBitWidth > 0 && MaxBitWidth < RootIT->getBitWidth())
> - MaxRequiredIntegerTy = IntegerType::get(F->getContext(), MaxBitWidth);
> -}
> -
> /// The SLPVectorizer Pass.
> struct SLPVectorizer : public FunctionPass {
> typedef SmallVector<StoreInst *, 8> StoreList;
> @@ -3354,7 +3228,6 @@ struct SLPVectorizer : public FunctionPa
> LoopInfo *LI;
> DominatorTree *DT;
> AssumptionCache *AC;
> - DemandedBits *DB;
>
> bool runOnFunction(Function &F) override {
> if (skipOptnoneFunction(F))
> @@ -3368,7 +3241,6 @@ struct SLPVectorizer : public FunctionPa
> LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
> DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
> AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
> - DB = &getAnalysis<DemandedBits>();
>
> Stores.clear();
> GEPs.clear();
> @@ -3398,7 +3270,7 @@ struct SLPVectorizer : public FunctionPa
>
> // Use the bottom up slp vectorizer to construct chains that start with
> // store instructions.
> - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB);
> + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC);
>
> // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
> // delete instructions.
> @@ -3441,7 +3313,6 @@ struct SLPVectorizer : public FunctionPa
> AU.addRequired<TargetTransformInfoWrapperPass>();
> AU.addRequired<LoopInfoWrapperPass>();
> AU.addRequired<DominatorTreeWrapperPass>();
> - AU.addRequired<DemandedBits>();
> AU.addPreserved<LoopInfoWrapperPass>();
> AU.addPreserved<DominatorTreeWrapperPass>();
> AU.addPreserved<AAResultsWrapperPass>();
> @@ -3546,7 +3417,6 @@ bool SLPVectorizer::vectorizeStoreChain(
> ArrayRef<Value *> Operands = Chain.slice(i, VF);
>
> R.buildTree(Operands);
> - R.computeMaxRequiredIntegerTy();
>
> int Cost = R.getTreeCost();
>
> @@ -3746,7 +3616,6 @@ bool SLPVectorizer::tryToVectorizeList(A
> Value *ReorderedOps[] = { Ops[1], Ops[0] };
> R.buildTree(ReorderedOps, None);
> }
> - R.computeMaxRequiredIntegerTy();
> int Cost = R.getTreeCost();
>
> if (Cost < -SLPCostThreshold) {
> @@ -4013,7 +3882,6 @@ public:
>
> for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
> V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
> - V.computeMaxRequiredIntegerTy();
>
> // Estimate cost.
> int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
>
> Modified:
> llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll?rev=258408&r1=258407&r2=258408&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll Thu Jan 21 11:17:20 2016
> @@ -1,5 +1,4 @@
> -; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s --check-prefix=PROFITABLE
> -; RUN: opt -S -slp-vectorizer -slp-threshold=-12 -dce -instcombine < %s | FileCheck %s --check-prefix=UNPROFITABLE
> +; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
>
> target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
> target triple = "aarch64--linux-gnu"
> @@ -19,13 +18,13 @@ target triple = "aarch64--linux-gnu"
> ; return sum;
> ; }
>
> -; PROFITABLE-LABEL: @gather_reduce_8x16_i32
> +; CHECK-LABEL: @gather_reduce_8x16_i32
> ;
> -; PROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> -; PROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
> -; PROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> -; PROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> -; PROFITABLE: sext i32 [[X]] to i64
> +; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> +; CHECK: zext <8 x i16> [[L]] to <8 x i32>
> +; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> +; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> +; CHECK: sext i32 [[X]] to i64
> ;
> define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
> entry:
> @@ -138,18 +137,14 @@ for.body:
> br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
> }
>
> -; UNPROFITABLE-LABEL: @gather_reduce_8x16_i64
> +; CHECK-LABEL: @gather_reduce_8x16_i64
> ;
> -; UNPROFITABLE: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
> -; UNPROFITABLE: zext <8 x i16> [[L]] to <8 x i32>
> -; UNPROFITABLE: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
> -; UNPROFITABLE: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
> -; UNPROFITABLE: sext i32 [[X]] to i64
> -;
> -; TODO: Although we can now vectorize this case while converting the i64
> -; subtractions to i32, the cost model currently finds vectorization to be
> -; unprofitable. The cost model is penalizing the sign and zero
> -; extensions in the vectorized version, but they are actually free.
> +; CHECK-NOT: load <8 x i16>
> +;
> +; FIXME: We are currently unable to vectorize the case with i64 subtraction
> +; because the zero extensions are too expensive. The solution here is to
> +; convert the i64 subtractions to i32 subtractions during vectorization.
> +; This would then match the case above.
> ;
> define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
> entry:
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits