[llvm] edf3a55 - [LoopVectorize][NFC] Centralize the setting of CostKind (#121937)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 17 07:06:22 PST 2025
Author: John Brawn
Date: 2025-01-17T15:06:18Z
New Revision: edf3a55bcecc8b0441a7a5fe6bda2023f86667a3
URL: https://github.com/llvm/llvm-project/commit/edf3a55bcecc8b0441a7a5fe6bda2023f86667a3
DIFF: https://github.com/llvm/llvm-project/commit/edf3a55bcecc8b0441a7a5fe6bda2023f86667a3.diff
LOG: [LoopVectorize][NFC] Centralize the setting of CostKind (#121937)
In each class that calculates instruction costs (VPCostContext,
LoopVectorizationCostModel, GeneratedRTChecks), set the CostKind once in
the constructor instead of in each function that calculates a cost. This
is in preparation for potentially changing the CostKind when compiling
for optsize.
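To illustrate the shape of the refactoring, here is a minimal standalone C++ sketch of the pattern (the CostModelSketch class and simplified TargetCostKind enum below are stand-ins for illustration, not LLVM's actual interfaces): the cost kind becomes a member set once in the constructor, so every cost query consults the same value and a later switch (e.g. to a size-oriented kind under optsize) touches one place.

```cpp
#include <cstdio>

// Simplified stand-in for llvm::TargetTransformInfo::TargetCostKind.
enum class TargetCostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

// Before this change, each cost helper hard-coded the kind locally.
// After, the kind is a member initialized once in the constructor.
class CostModelSketch {
  TargetCostKind CostKind;

public:
  explicit CostModelSketch(TargetCostKind Kind = TargetCostKind::RecipThroughput)
      : CostKind(Kind) {}

  // Hypothetical cost queries; both consult the shared member instead of
  // passing or re-declaring a cost kind per call.
  unsigned getArithmeticCost() const {
    return CostKind == TargetCostKind::CodeSize ? 1 : 2;
  }
  unsigned getMemoryCost() const {
    return CostKind == TargetCostKind::CodeSize ? 1 : 4;
  }
};

int main() {
  CostModelSketch Throughput;                     // default: reciprocal throughput
  CostModelSketch Size(TargetCostKind::CodeSize); // e.g. when optimizing for size
  std::printf("%u %u\n", Throughput.getMemoryCost(), Size.getMemoryCost());
}
```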
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6df11abda9e988..d79d9e8445b3df 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -987,7 +987,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
+ Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1555,9 +1555,9 @@ class LoopVectorizationCostModel {
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
- std::optional<InstructionCost>
- getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind) const;
+ std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *VectorTy) const;
/// Returns true if \p Op should be considered invariant and if it is
/// trivially hoistable.
@@ -1616,8 +1616,8 @@ class LoopVectorizationCostModel {
/// Estimate the overhead of scalarizing an instruction. This is a
/// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
- TTI::TargetCostKind CostKind) const;
+ InstructionCost getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const;
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
@@ -1798,6 +1798,9 @@ class LoopVectorizationCostModel {
/// All element types found in the loop.
SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
};
} // end namespace llvm
@@ -1838,13 +1841,17 @@ class GeneratedRTChecks {
PredicatedScalarEvolution &PSE;
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- const DataLayout &DL, bool AddBranchWeights)
+ const DataLayout &DL, bool AddBranchWeights,
+ TTI::TargetCostKind CostKind)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
- AddBranchWeights(AddBranchWeights), PSE(PSE) {}
+ AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1956,8 +1963,7 @@ class GeneratedRTChecks {
for (Instruction &I : *SCEVCheckBlock) {
if (SCEVCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
RTCheckCost += C;
}
@@ -1966,8 +1972,7 @@ class GeneratedRTChecks {
for (Instruction &I : *MemCheckBlock) {
if (MemCheckBlock->getTerminator() == &I)
continue;
- InstructionCost C =
- TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
+ InstructionCost C = TTI->getInstructionCost(&I, CostKind);
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
MemCheckCost += C;
}
@@ -2928,10 +2933,9 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
if (!VF.isScalar())
return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *RetTy = CI->getType();
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy))
return *RedCost;
SmallVector<Type *, 4> Tys;
@@ -2974,8 +2978,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
dyn_cast<IntrinsicInst>(CI));
- return TTI.getIntrinsicInstrCost(CostAttrs,
- TargetTransformInfo::TCK_RecipThroughput);
+ return TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
}
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
@@ -3432,8 +3435,6 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
I->getOpcode() == Instruction::URem);
assert(!isSafeToSpeculativelyExecute(I));
- const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Scalarization isn't legal for scalable vector types
InstructionCost ScalarizationCost = InstructionCost::getInvalid();
if (!VF.isScalable()) {
@@ -3455,7 +3456,7 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// The cost of insertelement and extractelement instructions needed for
// scalarization.
- ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
+ ScalarizationCost += getScalarizationOverhead(I, VF);
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
@@ -4445,7 +4446,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
for (const auto &Plan : VPlans) {
for (ElementCount VF : Plan->vectorFactors()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
- CM);
+ CM, CM.CostKind);
precomputeCosts(*Plan, VF, CostCtx);
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
@@ -5595,7 +5596,6 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(toVectorTy(I->getType(), VF)),
@@ -5742,7 +5742,6 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
@@ -5750,7 +5749,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF, CostKind);
+ Cost += getScalarizationOverhead(I, VF);
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
@@ -5783,7 +5782,6 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
@@ -5814,12 +5812,12 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy, {},
+ CostKind);
}
StoreInst *SI = cast<StoreInst>(I);
@@ -5842,9 +5840,9 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
const Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
- TargetTransformInfo::TCK_RecipThroughput, I);
+ TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+ Legal->isMaskRequired(I), Alignment,
+ CostKind, I);
}
InstructionCost
@@ -5857,7 +5855,6 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
Type *ValTy = getLoadStoreType(InsertPos);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned InterleaveFactor = Group->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -5889,9 +5886,9 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
}
std::optional<InstructionCost>
-LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty,
- TTI::TargetCostKind CostKind) const {
+LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *Ty) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6082,14 +6079,15 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, OpInfo, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, CostKind,
+ OpInfo, I);
}
return getWideningCost(I, VF);
}
-InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
- Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
+InstructionCost
+LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const {
// There is no mechanism yet to create a scalable scalarization loop,
// so this is currently Invalid.
@@ -6332,7 +6330,6 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
InstructionCost ScalarCost = InstructionCost::getInvalid();
InstructionCost VectorCost = InstructionCost::getInvalid();
InstructionCost IntrinsicCost = InstructionCost::getInvalid();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Function *ScalarFunc = CI->getCalledFunction();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
@@ -6348,8 +6345,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// Compute costs of unpacking argument values for the scalar calls and
// packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
// Honor ForcedScalars and UniformAfterVectorization decisions.
@@ -6373,7 +6369,7 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
// An in-loop reduction using an fmuladd intrinsic is a special case;
// we don't want the normal cost for that intrinsic.
if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
- if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy)) {
setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
getVectorIntrinsicIDForCall(CI, TLI),
std::nullopt, *RedCost);
@@ -6458,7 +6454,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
TargetTransformInfo::SK_Broadcast,
VectorType::get(IntegerType::getInt1Ty(
VecFunc->getFunctionType()->getContext()),
- VF));
+ VF),
+ {}, CostKind);
if (TLI && VecFunc && !CI->isNoBuiltin())
VectorCost =
@@ -6526,7 +6523,6 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(I, VF))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
auto *SE = PSE.getSE();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto HasSingleCopyAfterVectorization = [this](Instruction *I,
ElementCount VF) -> bool {
@@ -6702,7 +6698,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
InstructionCost MulCost = TTI::TCC_Free;
ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1));
if (!RHS || RHS->getZExtValue() != 1)
- MulCost = TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
+ MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
// Find the cost of the histogram operation itself.
Type *PtrTy = VectorType::get(HGram->Load->getPointerOperandType(), VF);
@@ -6713,9 +6710,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
{PtrTy, ScalarTy, MaskTy});
// Add the costs together with the add/sub operation.
- return TTI.getIntrinsicInstrCost(
- ICA, TargetTransformInfo::TCK_RecipThroughput) +
- MulCost + TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
+ TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, CostKind);
}
[[fallthrough]];
}
@@ -6740,7 +6736,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return 0;
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
// Certain instructions can be cheaper to vectorize if they have a constant
@@ -6905,7 +6901,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
// Detect reduction patterns
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+ if (auto RedCost = getReductionPatternCost(I, VF, VectorTy))
return *RedCost;
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6930,7 +6926,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::Call:
return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
- return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
+ return TTI.getInstructionCost(I, CostKind);
case Instruction::Alloca:
// We cannot easily widen alloca to a scalable alloca, as
// the result would need to be a vector of pointers.
@@ -7442,8 +7438,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// Pre-compute the cost for I, if it has a reduction pattern cost.
for (Instruction *I : ChainOpsAndOperands) {
- auto ReductionCost = CM.getReductionPatternCost(
- I, VF, toVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
+ auto ReductionCost =
+ CM.getReductionPatternCost(I, VF, toVectorTy(I->getType(), VF));
if (!ReductionCost)
continue;
@@ -7501,7 +7497,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
@@ -7581,6 +7578,16 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
return {*FirstPlan.vectorFactors().begin(), 0, 0};
+ LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
+ << (CM.CostKind == TTI::TCK_RecipThroughput
+ ? "Reciprocal Throughput\n"
+ : CM.CostKind == TTI::TCK_Latency
+ ? "Instruction Latency\n"
+ : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
+ : CM.CostKind == TTI::TCK_SizeAndLatency
+ ? "Code Size and Latency\n"
+ : "Unknown\n"));
+
ElementCount ScalarVF = ElementCount::getFixed(1);
assert(hasPlanWithVF(ScalarVF) &&
"More than a single plan/VF w/o any plan having scalar VF");
@@ -7634,7 +7641,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// simplifications not accounted for in the legacy cost model. If that's the
// case, don't trigger the assertion, as the extra simplifications may cause a
// different VF to be picked by the VPlan-based cost model.
- VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM);
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
+ CM.CostKind);
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
assert((BestFactor.Width == LegacyVF.Width ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
@@ -10155,7 +10163,7 @@ static bool processLoopInVPlanNativePath(
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10692,7 +10700,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
- AddBranchWeights);
+ AddBranchWeights, CM.CostKind);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index aa41c41e90c4c4..f1228368804beb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -770,7 +770,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
- : Ctx.TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
+ : Ctx.TTI.getCFInstrCost(Instruction::Br, Ctx.CostKind);
LLVM_DEBUG(dbgs() << "Cost of " << BackedgeCost << " for VF " << VF
<< ": vector loop backedge\n");
Cost += BackedgeCost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index eceddff6be6ff5..784cee6ed4b06c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -686,11 +686,13 @@ struct VPCostContext {
LLVMContext &LLVMCtx;
LoopVectorizationCostModel &CM;
SmallPtrSet<Instruction *, 8> SkipCostComputation;
+ TargetTransformInfo::TargetCostKind CostKind;
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
- Type *CanIVTy, LoopVectorizationCostModel &CM)
+ Type *CanIVTy, LoopVectorizationCostModel &CM,
+ TargetTransformInfo::TargetCostKind CostKind)
: TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()),
- CM(CM) {}
+ CM(CM), CostKind(CostKind) {}
/// Return the cost for \p UI with \p VF using the legacy cost model as
/// fallback until computing the cost of all recipes migrates to VPlan.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 5ae2f43e4950c5..aa5f92b235555e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -991,10 +991,9 @@ void VPWidenCallRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenCallRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
return Ctx.TTI.getCallInstrCost(nullptr, Variant->getReturnType(),
Variant->getFunctionType()->params(),
- CostKind);
+ Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1072,8 +1071,6 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Some backends analyze intrinsic arguments to determine cost. Use the
// underlying value for the operand if it has one. Otherwise try to use the
// operand of the underlying call instruction, if there is one. Otherwise
@@ -1113,7 +1110,7 @@ InstructionCost VPWidenIntrinsicRecipe::computeCost(ElementCount VF,
IntrinsicCostAttributes CostAttrs(
VectorIntrinsicID, RetTy, Arguments, ParamTys, FMF,
dyn_cast_or_null<IntrinsicInst>(getUnderlyingValue()));
- return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, CostKind);
+ return Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
}
StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
@@ -1196,7 +1193,7 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
// Assume that a non-constant update value (or a constant != 1) requires
// a multiply, and add that into the cost.
InstructionCost MulCost =
- Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy);
+ Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VTy, Ctx.CostKind);
if (IncAmt->isLiveIn()) {
ConstantInt *CI = dyn_cast<ConstantInt>(IncAmt->getLiveInIRValue());
@@ -1212,9 +1209,8 @@ InstructionCost VPHistogramRecipe::computeCost(ElementCount VF,
{PtrTy, IncTy, MaskTy});
// Add the costs together with the add/sub operation.
- return Ctx.TTI.getIntrinsicInstrCost(
- ICA, TargetTransformInfo::TCK_RecipThroughput) +
- MulCost + Ctx.TTI.getArithmeticInstrCost(Opcode, VTy);
+ return Ctx.TTI.getIntrinsicInstrCost(ICA, Ctx.CostKind) + MulCost +
+ Ctx.TTI.getArithmeticInstrCost(Opcode, VTy, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1278,7 +1274,6 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
bool ScalarCond = getOperand(0)->isDefinedOutsideLoopRegions();
Type *ScalarTy = Ctx.Types.inferScalarType(this);
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
VPValue *Op0, *Op1;
using namespace llvm::VPlanPatternMatch;
@@ -1296,8 +1291,8 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
Operands.append(SI->op_begin(), SI->op_end());
bool IsLogicalOr = match(this, m_LogicalOr(m_VPValue(Op0), m_VPValue(Op1)));
return Ctx.TTI.getArithmeticInstrCost(
- IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy, CostKind,
- {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
+ IsLogicalOr ? Instruction::Or : Instruction::And, VectorTy,
+ Ctx.CostKind, {Op1VK, Op1VP}, {Op2VK, Op2VP}, Operands, SI);
}
Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
@@ -1307,9 +1302,9 @@ InstructionCost VPWidenSelectRecipe::computeCost(ElementCount VF,
CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
Pred = Cmp->getPredicate();
- return Ctx.TTI.getCmpSelInstrCost(Instruction::Select, VectorTy, CondTy, Pred,
- CostKind, {TTI::OK_AnyValue, TTI::OP_None},
- {TTI::OK_AnyValue, TTI::OP_None}, SI);
+ return Ctx.TTI.getCmpSelInstrCost(
+ Instruction::Select, VectorTy, CondTy, Pred, Ctx.CostKind,
+ {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, SI);
}
VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
@@ -1454,12 +1449,11 @@ void VPWidenRecipe::execute(VPTransformState &State) {
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
switch (Opcode) {
case Instruction::FNeg: {
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
return Ctx.TTI.getArithmeticInstrCost(
- Opcode, VectorTy, CostKind,
+ Opcode, VectorTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
}
@@ -1502,21 +1496,22 @@ InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
if (CtxI)
Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
return Ctx.TTI.getArithmeticInstrCost(
- Opcode, VectorTy, CostKind,
+ Opcode, VectorTy, Ctx.CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
RHSInfo, Operands, CtxI, &Ctx.TLI);
}
case Instruction::Freeze: {
// This opcode is unknown. Assume that it is the same as 'mul'.
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
- return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
+ return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+ Ctx.CostKind);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
Type *VectorTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
- CostKind,
+ Ctx.CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, CtxI);
}
@@ -1646,7 +1641,7 @@ InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
auto *DestTy = cast<VectorType>(toVectorTy(getResultType(), VF));
// Arm TTI will use the underlying instruction to determine the cost.
return Ctx.TTI.getCastInstrCost(
- Opcode, DestTy, SrcTy, CCH, TTI::TCK_RecipThroughput,
+ Opcode, DestTy, SrcTy, CCH, Ctx.CostKind,
dyn_cast_if_present<Instruction>(getUnderlyingValue()));
}
@@ -1664,7 +1659,7 @@ void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
InstructionCost VPHeaderPHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
}
/// This function adds
@@ -2143,18 +2138,16 @@ void VPBlendRecipe::execute(VPTransformState &State) {
InstructionCost VPBlendRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
// Handle cases where only the first lane is used the same way as the legacy
// cost model.
if (vputils::onlyFirstLaneUsed(this))
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
Type *ResultTy = toVectorTy(Ctx.Types.inferScalarType(this), VF);
Type *CmpTy = toVectorTy(Type::getInt1Ty(Ctx.Types.getContext()), VF);
return (getNumIncomingValues() - 1) *
Ctx.TTI.getCmpSelInstrCost(Instruction::Select, ResultTy, CmpTy,
- CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ CmpInst::BAD_ICMP_PREDICATE, Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2274,7 +2267,6 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
RecurKind RdxKind = RdxDesc.getRecurrenceKind();
Type *ElementTy = Ctx.Types.inferScalarType(this);
auto *VectorTy = cast<VectorType>(toVectorTy(ElementTy, VF));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned Opcode = RdxDesc.getOpcode();
// TODO: Support any-of and in-loop reductions.
@@ -2292,15 +2284,15 @@ InstructionCost VPReductionRecipe::computeCost(ElementCount VF,
// Cost = Reduction cost + BinOp cost
InstructionCost Cost =
- Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, CostKind);
+ Ctx.TTI.getArithmeticInstrCost(Opcode, ElementTy, Ctx.CostKind);
if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind)) {
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
return Cost + Ctx.TTI.getMinMaxReductionCost(
- Id, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ Id, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
}
return Cost + Ctx.TTI.getArithmeticReductionCost(
- Opcode, VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+ Opcode, VectorTy, RdxDesc.getFastMathFlags(), Ctx.CostKind);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2531,7 +2523,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -2542,25 +2533,26 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
"Inconsecutive memory access should not have the order.");
return Ctx.TTI.getAddressComputationCost(Ty) +
Ctx.TTI.getGatherScatterOpCost(Ingredient.getOpcode(), Ty, Ptr,
- IsMasked, Alignment, CostKind,
+ IsMasked, Alignment, Ctx.CostKind,
&Ingredient);
}
InstructionCost Cost = 0;
if (IsMasked) {
Cost += Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
- AS, CostKind);
+ AS, Ctx.CostKind);
} else {
TTI::OperandValueInfo OpInfo =
Ctx.TTI.getOperandInfo(Ingredient.getOperand(0));
Cost += Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, AS,
- CostKind, OpInfo, &Ingredient);
+ Ctx.CostKind, OpInfo, &Ingredient);
}
if (!Reverse)
return Cost;
- return Cost += Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ return Cost +=
+ Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
+ cast<VectorType>(Ty), {}, Ctx.CostKind, 0);
}
void VPWidenLoadRecipe::execute(VPTransformState &State) {
@@ -2678,14 +2670,14 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+ Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ cast<VectorType>(Ty), {}, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2799,14 +2791,14 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
- Ingredient.getOpcode(), Ty, Alignment, AS, CostKind);
+ Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- cast<VectorType>(Ty), {}, CostKind, 0);
+ cast<VectorType>(Ty), {}, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3197,7 +3189,6 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
: getStoredValues()[InsertPosIdx]);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(InsertPos);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
unsigned InterleaveFactor = IG->getFactor();
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
@@ -3211,14 +3202,15 @@ InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF,
// Calculate the cost of the whole interleaved group.
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
InsertPos->getOpcode(), WideVecTy, IG->getFactor(), Indices,
- IG->getAlign(), AS, CostKind, getMask(), NeedsMaskForGaps);
+ IG->getAlign(), AS, Ctx.CostKind, getMask(), NeedsMaskForGaps);
if (!IG->isReverse())
return Cost;
return Cost + IG->getNumMembers() *
Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
- VectorTy, std::nullopt, CostKind, 0);
+ VectorTy, std::nullopt, Ctx.CostKind,
+ 0);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3428,9 +3420,8 @@ void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
InstructionCost
VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (VF.isScalar())
- return Ctx.TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
if (VF.isScalable() && VF.getKnownMinValue() == 1)
return InstructionCost::getInvalid();
@@ -3441,7 +3432,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
toVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
return Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Splice,
- cast<VectorType>(VectorTy), Mask, CostKind,
+ cast<VectorType>(VectorTy), Mask, Ctx.CostKind,
VF.getKnownMinValue() - 1);
}