[llvm] [LoopVectorize][NFC] Centralize the setting of CostKind (PR #121937)
John Brawn via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 7 06:18:18 PST 2025
https://github.com/john-brawn-arm created https://github.com/llvm/llvm-project/pull/121937
In each class that calculates instruction costs (VPCostContext, LoopVectorizationCostModel, GeneratedRTChecks), set the CostKind once in the constructor instead of in each function that calculates a cost. This is in preparation for potentially changing the CostKind when compiling for optsize.
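To illustrate the pattern in miniature, here is a hedged C++ sketch using a made-up ExampleCostModel class (the names ExampleCostModel, TTIRef, Kind, and getCost are illustrative only and do not appear in the patch): the cost kind becomes a member initialized once in the constructor and reused by every costing helper, so a later change of kind (for example under optsize) only needs to touch the constructor argument.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

namespace {
using llvm::Instruction;
using llvm::InstructionCost;
using TTI = llvm::TargetTransformInfo;

// Hypothetical cost-model class showing the shape of the refactoring.
class ExampleCostModel {
  const TTI &TTIRef;
  // Before the patch there was no such member; each helper declared a
  // local CostKind hard-coded to TTI::TCK_RecipThroughput.
  TTI::TargetCostKind CostKind;

public:
  // After the patch the kind is chosen exactly once, here. A follow-up
  // could pass e.g. TTI::TCK_CodeSize instead when optimizing for size.
  ExampleCostModel(const TTI &TTIRef,
                   TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput)
      : TTIRef(TTIRef), CostKind(Kind) {}

  // Every costing helper now reuses the member instead of taking or
  // hard-coding a cost kind of its own.
  InstructionCost getCost(const Instruction *I) const {
    return TTIRef.getInstructionCost(I, CostKind);
  }
};
} // end anonymous namespace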
From 7e2bb2aac31bace7310ff0cc070f9bb520327540 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Thu, 17 Oct 2024 17:00:32 +0100
Subject: [PATCH] [LoopVectorize][NFC] Centralize the setting of CostKind
In each class that calculates instruction costs (VPCostContext,
LoopVectorizationCostModel, GeneratedRTChecks), set the CostKind once
in the constructor instead of in each function that calculates a
cost. This is in preparation for potentially changing the CostKind
when compiling for optsize.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 109 +++++++++---------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 6 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 85 ++++++--------
4 files changed, 96 insertions(+), 106 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e0f629e14f6571..847150e539a8a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
+ Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1553,9 +1553,9 @@ class LoopVectorizationCostModel {
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- std::optional<InstructionCost>
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind) const;
+ std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *VectorTy) const;
/// Returns true if \p Op should be considered invariant and if it is
/// trivially hoistable.
@@ -1614,8 +1614,8 @@ class LoopVectorizationCostModel {
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
- TTI::TargetCostKind CostKind) const;
+ InstructionCost getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1796,6 +1796,9 @@ class LoopVectorizationCostModel {
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
};
} // end namespace llvm
@@ -1836,13 +1839,17 @@ class GeneratedRTChecks {
PredicatedScalarEvolution &PSE;
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- const DataLayout &DL, bool AddBranchWeights)
+ const DataLayout &DL, bool AddBranchWeights,
+ TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
- AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+ AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1954,8 +1961,7 @@ class GeneratedRTChecks {
for (Instruction &I : *SCEVCheckBlock) {
if (SCEVCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
@@ -1964,8 +1970,7 @@ class GeneratedRTChecks {
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
MemCheckCost += C;
}
@@ -2926,10 +2931,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
if (!VF.isScalar())
return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *RetTy = CI->getType();
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
return *RedCost;
SmallVector<Type *, 4> Tys;
@@ -2937,7 +2941,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
Tys.push_back(ArgOp->getType());
InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
+ TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys);
// If this is an intrinsic we may have a lower cost for it.
if (getVectorIntrinsicIDForCall(CI, TLI)) {
@@ -2972,8 +2976,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
- return TTI.getIntrinsicInstrCost(CostAttrs,
- TargetTransformInfo::TCK_RecipThroughput);
+ return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3430,8 +3433,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
I->getOpcode() == Instruction::URem);
assert(!isSafeToSpeculativelyExecute(I));
- const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Scalarization isn't legal for scalable vector types
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (!VF.isScalable()) {
@@ -3453,7 +3454,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// The cost of insertelement and extractelement instructions needed for
// scalarization.
- ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+ ScalarizationCost += getScalarizationOverhead(I, VF);
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
@@ -4426,7 +4427,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
- CM);
+ CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5576,7 +5577,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5723,7 +5723,6 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
@@ -5731,7 +5730,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, CostKind);
+ Cost += getScalarizationOverhead(I, VF);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
@@ -5764,7 +5763,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
@@ -5795,12 +5793,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+ CostKind);
}
StoreInst *SI = cast<StoreInst>(I);
@@ -5823,9 +5821,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
const Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
- TargetTransformInfo::TCK_RecipThroughput, I);
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
}
InstructionCost
@@ -5838,7 +5836,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(InsertPos);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5870,9 +5867,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty,
- TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *Ty) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6063,14 +6060,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, OpInfo, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+ OpInfo, I);
}
return getWideningCost(I, VF);
}
-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
- Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6313,7 +6311,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
InstructionCost ScalarCost = InstructionCost::getInvalid();
InstructionCost VectorCost = InstructionCost::getInvalid();
InstructionCost IntrinsicCost = InstructionCost::getInvalid();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6329,8 +6326,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6354,7 +6350,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
getVectorIntrinsicIDForCall(CI, TLI),
std::nullopt, *RedCost);
@@ -6439,7 +6435,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
TargetTransformInfo::SK_Broadcast,
VectorType::get(IntegerType::getInt1Ty(
VecFunc->getFunctionType()->getContext()),
- VF));
+ VF),
+ {}, CostKind);
if (TLI && VecFunc && !CI->isNoBuiltin())
VectorCost =
@@ -6507,7 +6504,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
auto *SE = PSE.getSE();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto HasSingleCopyAfterVectorization = [this](Instruction *I,
ElementCount VF) -> bool {
@@ -6694,9 +6690,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
{PtrTy, ScalarTy, MaskTy});
// Add the costs together with the add/sub operation.
- return TTI.getIntrinsicInstrCost(
- ICA, TargetTransformInfo::TCK_RecipThroughput) +
- MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+ TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
}
[[fallthrough]];
}
@@ -6721,7 +6716,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return 0;
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
// Certain instructions can be cheaper to vectorize if they have a constant
@@ -6886,7 +6881,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6911,7 +6906,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Call:
return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
- return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+ return TTI.getInstructionCost(I, CostKind);
case Instruction::Alloca:
// We cannot easily widen alloca to a scalable alloca, as
// the result would need to be a vector of pointers.
@@ -7423,8 +7418,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
- auto ReductionCost = CM.getReductionPatternCost(
- I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ auto ReductionCost =
+ CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
if (!ReductionCost)
continue;
@@ -7482,7 +7477,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7611,7 +7607,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
assert((BestFactor.Width == LegacyVF.Width ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -9971,7 +9968,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10488,7 +10485,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e804f81c36dba0..e0c9f6d27cb881 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -770,7 +770,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
- : Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ : Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
<< ": vector loop backedge\n");
Cost += BackedgeCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d7bf97d305ed1..25f889028cb396 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -686,11 +686,13 @@ struct VPCostContext {
LLVMContext &LLVMCtx;
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
+ TargetTransformInfo::TargetCostKind CostKind;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
- Type *CanIVTy, LoopVectorizationCostModel &CM)
+ Type *CanIVTy, LoopVectorizationCostModel &CM,
+ TargetTransformInfo::TargetCostKind CostKind)
: TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
- CM(CM) {}
+ CM(CM), CostKind(CostKind) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 76336ae447edb7..4c02c78c59d16d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -920,10 +920,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
Variant->getFunctionType()->params(),
- CostKind);
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1001,8 +1000,6 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
@@ -1042,7 +1039,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
IntrinsicCostAttributes CostAttrs(
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
- return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+ return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
@@ -1141,9 +1138,8 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
{PtrTy, IncTy, MaskTy});
// Add the costs together with the add/sub operation.
- return Ctx.TTI.getIntrinsicInstrCost(
- ICA, TargetTransformInfo::TCK_RecipThroughput) +
- MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
+ Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1204,7 +1200,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
VPValue *Op0, *Op1;
using namespace llvm::VPlanPatternMatch;
@@ -1222,8 +1217,8 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Operands.append(SI->op_begin(), SI->op_end());
bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
return Ctx.TTI.getArithmeticInstrCost(
- IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind,
- {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
+ IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
+ Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
}
Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
@@ -1233,9 +1228,9 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
Pred = Cmp->getPredicate();
- return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred,
- CostKind, {TTI::OK_AnyValue, TTI::OP_None},
- {TTI::OK_AnyValue, TTI::OP_None}, SI);
+ return Ctx.TTI.getCmpSelInstrCost(
+ Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
+ {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
@@ -1380,12 +1375,11 @@ void VPWidenRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
switch (Opcode) {
case Instruction::FNeg: {
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(
- Opcode, VectorTy, CostKind,
+ Opcode, VectorTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
}
@@ -1428,21 +1422,22 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
if (CtxI)
Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
return Ctx.TTI.getArithmeticInstrCost(
- Opcode, VectorTy, CostKind,
+ Opcode, VectorTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
RHSInfo, Operands, CtxI, &Ctx.TLI);
}
case Instruction::Freeze: {
// This opcode is unknown. Assume that it is the same as 'mul'.
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
- return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+ return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+ Ctx.CostKind);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
- CostKind,
+ Ctx.CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, CtxI);
}
@@ -1572,7 +1567,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
// Arm TTI will use the underlying instruction to determine the cost.
return Ctx.TTI.getCastInstrCost(
- Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
+ Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}
@@ -1590,7 +1585,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}
/// This function adds
@@ -2069,18 +2064,16 @@ void VPBlendRecipe::execute(VPTransformState &State) {
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Handle cases where only the first lane is used the same way as the legacy
// cost model.
if (vputils::onlyFirstLaneUsed(this))
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
return (getNumIncomingValues() - 1) *
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2200,7 +2193,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
RecurKind RdxKind = RdxDesc.getRecurrenceKind();
Type *ElementTy = Ctx.Types.inferScalarType(this);
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned Opcode = RdxDesc.getOpcode();
// TODO: Support any-of and in-loop reductions.
@@ -2218,15 +2210,15 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
// Cost = Reduction cost + BinOp cost
InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
+ Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
return Cost + Ctx.TTI.getMinMaxReductionCost(
- Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
}
return Cost + Ctx.TTI.getArithmeticReductionCost(
- Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2457,7 +2449,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -2468,25 +2459,26 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
"Inconsecutive memory access should not have the order.");
return Ctx.TTI.getAddressComputationCost(Ty) +
Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, CostKind,
+ IsMasked, Alignment, Ctx.CostKind,
&Ingredient);
}
InstructionCost Cost = 0;
if (IsMasked) {
Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
- AS, CostKind);
+ AS, Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo =
Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
- CostKind, OpInfo, &Ingredient);
+ Ctx.CostKind, OpInfo, &Ingredient);
}
if (!Reverse)
return Cost;
- return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ return Cost +=
+ Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+ cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}
void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -2604,14 +2596,14 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+ Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ cast<VectorType>(Ty), {}, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2725,14 +2717,14 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+ Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ cast<VectorType>(Ty), {}, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3090,7 +3082,6 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
: getStoredValues()[InsertPosIdx]);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned InterleaveFactor = IG->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -3104,14 +3095,15 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
// Calculate the cost of the whole interleaved group.
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
- IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);
+ IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
if (!IG->isReverse())
return Cost;
return Cost + IG->getNumMembers() *
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- VectorTy, std::nullopt, CostKind, 0);
+ VectorTy, std::nullopt, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3321,9 +3313,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (VF.isScalar())
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
if (VF.isScalable() && VF.getKnownMinValue() == 1)
return InstructionCost::getInvalid();
@@ -3334,7 +3325,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
- cast<VectorType>(VectorTy), Mask, CostKind,
+ cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
VF.getKnownMinValue() - 1);
}