[llvm] [LoopVectorize][NFC] Centralize the setting of CostKind (PR #121937)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 7 06:18:52 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: John Brawn (john-brawn-arm)
In each class that calculates instruction costs (VPCostContext, LoopVectorizationCostModel, GeneratedRTChecks), set the CostKind once in the constructor instead of separately in each function that calculates a cost. This is in preparation for potentially changing the CostKind when compiling for optsize.
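For illustration, here is a minimal sketch of the pattern being applied (`ExampleCostModel` is a hypothetical stand-in, not a class from the patch): the cost kind becomes a member initialized once in the constructor, and every costing helper reads that member instead of hard-coding `TTI::TCK_RecipThroughput` locally.

```cpp
// Hypothetical sketch of the pattern; not code from the patch.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

class ExampleCostModel {
  const TargetTransformInfo &TTI;

  /// The kind of cost we are calculating, fixed once at construction.
  TTI::TargetCostKind CostKind;

public:
  ExampleCostModel(const TargetTransformInfo &TTI)
      : TTI(TTI), CostKind(TTI::TCK_RecipThroughput) {}

  // Previously each helper passed TTI::TCK_RecipThroughput explicitly.
  // Reading the shared member means a later change (for example, choosing
  // TTI::TCK_CodeSize when the function hasOptSize()) touches only the
  // constructor.
  InstructionCost getCost(Instruction *I) const {
    return TTI.getInstructionCost(I, CostKind);
  }
};
```

This mirrors what the diff below does for LoopVectorizationCostModel, GeneratedRTChecks, and VPCostContext.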
---
Patch is 35.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/121937.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+53-56)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.cpp (+1-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+4-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+38-47)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e0f629e14f6571..847150e539a8a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
+ Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1553,9 +1553,9 @@ class LoopVectorizationCostModel {
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- std::optional<InstructionCost>
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind) const;
+ std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *VectorTy) const;
/// Returns true if \p Op should be considered invariant and if it is
/// trivially hoistable.
@@ -1614,8 +1614,8 @@ class LoopVectorizationCostModel {
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
- TTI::TargetCostKind CostKind) const;
+ InstructionCost getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1796,6 +1796,9 @@ class LoopVectorizationCostModel {
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
};
} // end namespace llvm
@@ -1836,13 +1839,17 @@ class GeneratedRTChecks {
PredicatedScalarEvolution &PSE;
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- const DataLayout &DL, bool AddBranchWeights)
+ const DataLayout &DL, bool AddBranchWeights,
+ TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
- AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+ AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1954,8 +1961,7 @@ class GeneratedRTChecks {
for (Instruction &I : *SCEVCheckBlock) {
if (SCEVCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
@@ -1964,8 +1970,7 @@ class GeneratedRTChecks {
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
MemCheckCost += C;
}
@@ -2926,10 +2931,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
if (!VF.isScalar())
return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *RetTy = CI->getType();
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
return *RedCost;
SmallVector<Type *, 4> Tys;
@@ -2937,7 +2941,7 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
Tys.push_back(ArgOp->getType());
InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
+ TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys);
// If this is an intrinsic we may have a lower cost for it.
if (getVectorIntrinsicIDForCall(CI, TLI)) {
@@ -2972,8 +2976,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
- return TTI.getIntrinsicInstrCost(CostAttrs,
- TargetTransformInfo::TCK_RecipThroughput);
+ return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3430,8 +3433,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
I->getOpcode() == Instruction::URem);
assert(!isSafeToSpeculativelyExecute(I));
- const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Scalarization isn't legal for scalable vector types
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (!VF.isScalable()) {
@@ -3453,7 +3454,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// The cost of insertelement and extractelement instructions needed for
// scalarization.
- ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+ ScalarizationCost += getScalarizationOverhead(I, VF);
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
@@ -4426,7 +4427,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
- CM);
+ CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5576,7 +5577,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5723,7 +5723,6 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
@@ -5731,7 +5730,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, CostKind);
+ Cost += getScalarizationOverhead(I, VF);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
@@ -5764,7 +5763,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
@@ -5795,12 +5793,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+ CostKind);
}
StoreInst *SI = cast<StoreInst>(I);
@@ -5823,9 +5821,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
const Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
- TargetTransformInfo::TCK_RecipThroughput, I);
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
}
InstructionCost
@@ -5838,7 +5836,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(InsertPos);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5870,9 +5867,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty,
- TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *Ty) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6063,14 +6060,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, OpInfo, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+ OpInfo, I);
}
return getWideningCost(I, VF);
}
-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
- Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6313,7 +6311,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
InstructionCost ScalarCost = InstructionCost::getInvalid();
InstructionCost VectorCost = InstructionCost::getInvalid();
InstructionCost IntrinsicCost = InstructionCost::getInvalid();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6329,8 +6326,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6354,7 +6350,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
getVectorIntrinsicIDForCall(CI, TLI),
std::nullopt, *RedCost);
@@ -6439,7 +6435,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
TargetTransformInfo::SK_Broadcast,
VectorType::get(IntegerType::getInt1Ty(
VecFunc->getFunctionType()->getContext()),
- VF));
+ VF),
+ {}, CostKind);
if (TLI && VecFunc && !CI->isNoBuiltin())
VectorCost =
@@ -6507,7 +6504,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
auto *SE = PSE.getSE();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto HasSingleCopyAfterVectorization = [this](Instruction *I,
ElementCount VF) -> bool {
@@ -6694,9 +6690,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
{PtrTy, ScalarTy, MaskTy});
// Add the costs together with the add/sub operation.
- return TTI.getIntrinsicInstrCost(
- ICA, TargetTransformInfo::TCK_RecipThroughput) +
- MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+ TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
}
[[fallthrough]];
}
@@ -6721,7 +6716,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return 0;
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
// Certain instructions can be cheaper to vectorize if they have a constant
@@ -6886,7 +6881,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6911,7 +6906,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Call:
return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
- return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+ return TTI.getInstructionCost(I, CostKind);
case Instruction::Alloca:
// We cannot easily widen alloca to a scalable alloca, as
// the result would need to be a vector of pointers.
@@ -7423,8 +7418,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
- auto ReductionCost = CM.getReductionPatternCost(
- I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ auto ReductionCost =
+ CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
if (!ReductionCost)
continue;
@@ -7482,7 +7477,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7611,7 +7607,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
assert((BestFactor.Width == LegacyVF.Width ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -9971,7 +9968,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10488,7 +10485,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index e804f81c36dba0..e0c9f6d27cb881 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -770,7 +770,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
- : Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ : Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
<< ": vector loop backedge\n");
Cost += BackedgeCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 9d7bf97d305ed1..25f889028cb396 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -686,11 +686,13 @@ struct VPCostContext {
LLVMContext &LLVMCtx;
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComput...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/121937