[llvm] [LV] Cache call vectorization decisions (PR #66521)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 15 08:32:00 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-vectorizers
Changes:
LoopVectorize currently queries VFDatabase repeatedly for each call
instruction (CI), and each query rescans all vector variants.
This patch instead makes a decision for each call once per VF, based
on the costs of scalarizing the call, calling a vector variant of the
function, and using a vector intrinsic, then caches that decision
along with the relevant information for use in planning and plan execution.
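
For orientation, here is a condensed, standalone sketch of the caching scheme the patch introduces: one decision per (call, VF) pair, computed up front and looked up during planning. The types are simplified stand-ins (the real code keys a DenseMap on {CallInst *, ElementCount} inside LoopVectorizationCostModel), and the variant name is illustrative:

```cpp
// Minimal sketch of the (call, VF) -> decision cache; simplified types,
// not the actual LLVM classes.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>
#include <optional>
#include <utility>

enum InstWidening { CM_Scalarize, CM_VectorCall, CM_IntrinsicCall };

struct CallWideningDecision {
  InstWidening Kind;
  const char *Variant;             // stands in for Function *
  unsigned IID;                    // stands in for Intrinsic::ID
  std::optional<unsigned> MaskPos; // mask parameter index, if masked
  int64_t Cost;                    // stands in for InstructionCost
};

// Keyed on (call site, vectorization factor): one decision per call per VF.
using CallKey = std::pair<const void * /*CallInst*/, unsigned /*VF*/>;
static std::map<CallKey, CallWideningDecision> CallWideningDecisions;

// Computed once, in the analogue of setVectorizedCallDecision(VF).
void setCallWideningDecision(CallKey K, CallWideningDecision D) {
  assert(K.second > 1 && "Expected vector VF");
  CallWideningDecisions[K] = D;
}

// Planning and recipe construction just read the cached answer.
const CallWideningDecision &getCallWideningDecision(CallKey K) {
  return CallWideningDecisions.at(K);
}

int main() {
  int DummyCall = 0; // stands in for a CallInst in the loop
  CallKey K{&DummyCall, /*VF=*/4};
  setCallWideningDecision(K, {CM_VectorCall, "_ZGVnN4v_foo", /*IID=*/0,
                              std::nullopt, /*Cost=*/10});
  std::cout << "variant: " << getCallWideningDecision(K).Variant << "\n";
}
```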
---
Patch is 21.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66521.diff
1 File Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+211-119)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..7e81707b8b15750 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1245,6 +1245,13 @@ class LoopVectorizationCostModel {
/// avoid redundant calculations.
void setCostBasedWideningDecision(ElementCount VF);
+ /// A call may be vectorized in different ways depending on whether we have
+ /// vectorized variants available and whether the target supports masking.
+ /// This function analyzes all calls in the loop at the supplied VF,
+ /// makes a decision based on the costs of available options, and stores that
+ /// decision in a map for use in planning and plan execution.
+ void setVectorizedCallDecision(ElementCount VF);
+
/// A struct that represents some properties of the register usage
/// of a loop.
struct RegisterUsage {
@@ -1356,7 +1363,9 @@ class LoopVectorizationCostModel {
CM_Widen_Reverse, // For consecutive accesses with stride -1.
CM_Interleave,
CM_GatherScatter,
- CM_Scalarize
+ CM_Scalarize,
+ CM_VectorCall,
+ CM_IntrinsicCall
};
/// Save vectorization decision \p W and \p Cost taken by the cost model for
@@ -1412,6 +1421,29 @@ class LoopVectorizationCostModel {
return WideningDecisions[InstOnVF].second;
}
+ struct CallWideningDecision {
+ InstWidening Kind;
+ Function *Variant;
+ Intrinsic::ID IID;
+ std::optional<unsigned> MaskPos;
+ InstructionCost Cost;
+ };
+
+ void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
+ Function *Variant, Intrinsic::ID IID,
+ std::optional<unsigned> MaskPos,
+ InstructionCost Cost) {
+ assert(!VF.isScalar() && "Expected vector VF");
+ CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
+ MaskPos, Cost};
+ }
+
+ CallWideningDecision getCallWideningDecision(CallInst *CI,
+ ElementCount VF) const {
+ assert(!VF.isScalar() && "Expected vector VF");
+ return CallWideningDecisions.at(std::make_pair(CI, VF));
+ }
+
/// Return True if instruction \p I is an optimizable truncate whose operand
/// is an induction variable. Such a truncate will be removed by adding a new
/// induction variable with the destination type.
@@ -1450,6 +1482,7 @@ class LoopVectorizationCostModel {
if (VF.isScalar() || Uniforms.contains(VF))
return;
setCostBasedWideningDecision(VF);
+ setVectorizedCallDecision(VF);
collectLoopUniforms(VF);
collectLoopScalars(VF);
}
@@ -1616,16 +1649,13 @@ class LoopVectorizationCostModel {
/// Estimate cost of a call instruction CI if it were vectorized with factor
/// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed. The flag NeedToScalarize shows if the call needs to be
- /// scalarized -
- /// i.e. either vector version isn't available, or is too expensive.
- InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
- Function **Variant,
- bool *NeedsMask = nullptr) const;
+ /// if it's needed.
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
/// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() {
WideningDecisions.clear();
+ CallWideningDecisions.clear();
Uniforms.clear();
Scalars.clear();
}
@@ -1692,7 +1722,7 @@ class LoopVectorizationCostModel {
/// part of that pattern.
std::optional<InstructionCost>
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
- TTI::TargetCostKind CostKind);
+ TTI::TargetCostKind CostKind) const;
/// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
@@ -1814,6 +1844,11 @@ class LoopVectorizationCostModel {
DecisionList WideningDecisions;
+ using CallDecisionList =
+ DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
+
+ CallDecisionList CallWideningDecisions;
+
/// Returns true if \p V is expected to be vectorized and it needs to be
/// extracted.
bool needsExtract(Value *V, ElementCount VF) const {
@@ -3278,76 +3313,33 @@ static void cse(BasicBlock *BB) {
}
}
-InstructionCost LoopVectorizationCostModel::getVectorCallCost(
- CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const {
- Function *F = CI->getCalledFunction();
- Type *ScalarRetTy = CI->getType();
- SmallVector<Type *, 4> Tys, ScalarTys;
- bool MaskRequired = Legal->isMaskRequired(CI);
- for (auto &ArgOp : CI->args())
- ScalarTys.push_back(ArgOp->getType());
+InstructionCost
+LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
+ ElementCount VF) const {
+ // We only need to calculate a cost if the VF is scalar; for actual vectors
+ // we should already have a pre-calculated cost at each VF.
+ if (!VF.isScalar())
+ return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
- // Estimate cost of scalarized vector call. The source operands are assumed
- // to be vectors, so we need to extract individual elements from there,
- // execute VF scalar calls, and then gather the result into the vector return
- // value.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- InstructionCost ScalarCallCost =
- TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, CostKind);
- if (VF.isScalar())
- return ScalarCallCost;
-
- // Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
- InstructionCost ScalarizationCost =
- getScalarizationOverhead(CI, VF, CostKind);
+ Type *RetTy = CI->getType();
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind))
+ return *RedCost;
- InstructionCost Cost =
- ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
- // If we can't emit a vector call for this function, then the currently found
- // cost is the cost we need to return.
- InstructionCost MaskCost = 0;
- VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
- if (NeedsMask)
- *NeedsMask = MaskRequired;
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we want an unmasked vector function but can't find one matching the VF,
- // maybe we can find vector function that does use a mask and synthesize
- // an all-true mask.
- if (!VecFunc && !MaskRequired) {
- Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
- VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- // If we found one, add in the cost of creating a mask
- if (VecFunc) {
- if (NeedsMask)
- *NeedsMask = true;
- MaskCost = TTI.getShuffleCost(
- TargetTransformInfo::SK_Broadcast,
- VectorType::get(
- IntegerType::getInt1Ty(VecFunc->getFunctionType()->getContext()),
- VF));
- }
- }
+ SmallVector<Type *, 4> Tys;
+ for (auto &ArgOp : CI->args())
+ Tys.push_back(ArgOp->getType());
- // We don't support masked function calls yet, but we can scalarize a
- // masked call with branches (unless VF is scalable).
- if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return VF.isScalable() ? InstructionCost::getInvalid() : Cost;
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(CI->getCalledFunction(), RetTy, Tys, CostKind);
- // If the corresponding vector cost is cheaper, return its cost.
- InstructionCost VectorCallCost =
- TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
- if (VectorCallCost < Cost) {
- *Variant = VecFunc;
- Cost = VectorCallCost;
+ // If this is an intrinsic we may have a lower cost for it.
+ if (getVectorIntrinsicIDForCall(CI, TLI)) {
+ InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+ return std::min(ScalarCallCost, IntrinsicCost);
}
- return Cost;
+ return ScalarCallCost;
}
static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
@@ -4279,7 +4271,10 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
default:
return true;
case Instruction::Call:
- return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+ if (VF.isScalar())
+ return true;
+ return CallWideningDecisions.at(std::make_pair(cast<CallInst>(I), VF))
+ .Kind == CM_Scalarize;
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -5992,6 +5987,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
if (ValuesToIgnore.count(I))
continue;
+ collectInLoopReductions();
+
// For each VF find the maximum usage of registers.
for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
// Count the number of registers used, per register class, given all open
@@ -6511,7 +6508,8 @@ LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(
- Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
+ Instruction *I, ElementCount VF, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
using namespace llvm::PatternMatch;
// Early exit for no inloop reductions
if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
@@ -6549,10 +6547,10 @@ LoopVectorizationCostModel::getReductionPatternCost(
// Find the reduction this chain is a part of and calculate the basic cost of
// the reduction on its own.
- Instruction *LastChain = InLoopReductionImmediateChains[RetI];
+ Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
Instruction *ReductionPhi = LastChain;
while (!isa<PHINode>(ReductionPhi))
- ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
+ ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);
const RecurrenceDescriptor &RdxDesc =
Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
@@ -6970,6 +6968,122 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
}
+void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
+ if (VF.isScalar())
+ return;
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+
+ if (!CI || CallWideningDecisions.contains(std::make_pair(CI, VF)))
+ continue;
+
+ InstructionCost ScalarCost = InstructionCost::getInvalid();
+ InstructionCost VectorCost = InstructionCost::getInvalid();
+ InstructionCost IntrinsicCost = InstructionCost::getInvalid();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ Function *ScalarFunc = CI->getCalledFunction();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ bool MaskRequired = Legal->isMaskRequired(CI);
+ for (auto &ArgOp : CI->args())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // An in-loop reduction using an fmuladd intrinsic is a special case;
+ // we don't want the normal cost for that intrinsic.
+ if (RecurrenceDescriptor::isFMulAddIntrinsic(CI))
+ if (auto RedCost = getReductionPatternCost(CI, VF, RetTy, CostKind)) {
+ setCallWideningDecision(CI, VF, CM_IntrinsicCall, nullptr,
+ getVectorIntrinsicIDForCall(CI, TLI),
+ std::nullopt, *RedCost);
+ continue;
+ }
+
+ // Estimate cost of scalarized vector call. The source operands are
+ // assumed to be vectors, so we need to extract individual elements from
+ // there, execute VF scalar calls, and then gather the result into the
+ // vector return value.
+ InstructionCost ScalarCallCost =
+ TTI.getCallInstrCost(ScalarFunc, ScalarRetTy, ScalarTys, CostKind);
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
+ InstructionCost ScalarizationCost =
+ getScalarizationOverhead(CI, VF, CostKind);
+
+ ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
+
+ // Find the cost of vectorizing the call, if we can find a suitable
+ // vector variant of the function.
+ InstructionCost MaskCost = 0;
+ VFShape Shape = VFShape::get(*CI, VF, MaskRequired);
+ bool UsesMask = MaskRequired;
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we want an unmasked vector function but can't find one matching the
+ // VF, maybe we can find a vector function that does use a mask and
+ // synthesize an all-true mask.
+ if (!VecFunc && !MaskRequired) {
+ Shape = VFShape::get(*CI, VF, /*HasGlobalPred=*/true);
+ VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ // If we found one, add in the cost of creating a mask
+ if (VecFunc) {
+ UsesMask = true;
+ MaskCost = TTI.getShuffleCost(
+ TargetTransformInfo::SK_Broadcast,
+ VectorType::get(IntegerType::getInt1Ty(
+ VecFunc->getFunctionType()->getContext()),
+ VF));
+ }
+ }
+
+ std::optional<unsigned> MaskPos = std::nullopt;
+ if (VecFunc && UsesMask) {
+ for (const VFInfo &Info : VFDatabase::getMappings(*CI))
+ if (Info.Shape == Shape) {
+ assert(Info.isMasked() && "Vector function info shape mismatch");
+ MaskPos = Info.getParamIndexForOptionalMask().value();
+ break;
+ }
+
+ assert(MaskPos.has_value() && "Unable to find mask parameter index");
+ }
+
+ if (TLI && VecFunc && !CI->isNoBuiltin())
+ VectorCost =
+ TTI.getCallInstrCost(nullptr, RetTy, Tys, CostKind) + MaskCost;
+
+ // Find the cost of an intrinsic; some targets may have instructions that
+ // perform the operation without needing an actual call.
+ Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (IID != Intrinsic::not_intrinsic)
+ IntrinsicCost = getVectorIntrinsicCost(CI, VF);
+
+ InstructionCost Cost = ScalarCost;
+ InstWidening Decision = CM_Scalarize;
+
+ if (VectorCost <= Cost) {
+ Cost = VectorCost;
+ Decision = CM_VectorCall;
+ }
+
+ if (IntrinsicCost <= Cost) {
+ Cost = IntrinsicCost;
+ Decision = CM_IntrinsicCall;
+ }
+
+ setCallWideningDecision(CI, VF, Decision, VecFunc, IID, MaskPos, Cost);
+ }
+ }
+}
+
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
@@ -7227,6 +7341,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI::CastContextHint::Reversed;
case LoopVectorizationCostModel::CM_Unknown:
llvm_unreachable("Instr did not go through cost modelling?");
+ case LoopVectorizationCostModel::CM_VectorCall:
+ case LoopVectorizationCostModel::CM_IntrinsicCall:
+ llvm_unreachable("Instr has invalid widening decision");
}
llvm_unreachable("Unhandled case!");
@@ -7284,19 +7401,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
- case Instruction::Call: {
- if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
- if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
- return *RedCost;
- Function *Variant;
- CallInst *CI = cast<CallInst>(I);
- InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant);
- if (getVectorIntrinsicIDForCall(CI, TLI)) {
- InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
- return std::min(CallCost, IntrinsicCost);
- }
- return CallCost;
- }
+ case Instruction::Call:
+ return getVectorCallCost(cast<CallInst>(I), VF);
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
case Instruction::Alloca:
@@ -7476,9 +7582,9 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
+ CM.collectInLoopReductions();
if (CM.selectUserVectorizationFactor(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(UserVF, UserVF);
if (!hasPlanWithVF(UserVF)) {
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
@@ -7502,6 +7608,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
VFCandidates.insert(VF);
+ CM.collectInLoopReductions();
for (const auto &VF : VFCandidates) {
// Collect Uniform and Scalar instructions after vectorization with VF.
CM.collectUniformsAndScalars(VF);
@@ -7512,7 +7619,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.collectInstsToScalarize(VF);
}
- CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
@@ -8318,22 +8424,15 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
bool ShouldUseVectorIntrinsic =
ID && LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) -> bool {
- Function *Variant;
- // Is it beneficial to perform intrinsic call compared to lib
- // call?
- InstructionCost CallCost =
- CM.getVectorCallCost(CI, VF, &Variant);
- InstructionCost IntrinsicCost =
- CM.getVectorIntrinsicCost(CI, VF);
- return IntrinsicCost <= CallCost;
+ return CM.getCallWideningDecision(CI, VF).Kind ==
+ LoopVectorizationCostModel::CM_IntrinsicCall;
},
Range);
if (ShouldUseVectorIntrinsic)
return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID);
Function *Variant = nullptr;
- ElementCount VariantVF;
- bool NeedsMask = false;
+ std::optional<unsigned> MaskPos;
// Is it better to call a vectorized version of the function than to scalarize
// the call?
auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8352,16 +8451,19 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
// finds a valid variant.
if (Variant)
return false;
- CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask);
- // If we found a valid vector variant at this VF, then store the VF
- // in case we need to generate a mask.
- if (Variant)
- VariantVF = VF;
- return Variant != nullptr;
+ LoopVectorizationCostModel::CallWideningDecision Decision =
+ CM.getCallWideningDecision(CI, VF);
+ if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
+ Variant = Decision.Variant;
+ MaskPos = Decision.MaskPos;
+ ...
[truncated]
``````````
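The core of the new logic is the three-way cost comparison at the end of setVectorizedCallDecision. A standalone restatement of just that comparison, with int64_t max standing in for InstructionCost::getInvalid() (which compares greater than any valid cost):

```cpp
#include <cstdint>
#include <limits>

enum InstWidening { CM_Scalarize, CM_VectorCall, CM_IntrinsicCall };

// Invalid costs never win the comparison.
constexpr int64_t Invalid = std::numeric_limits<int64_t>::max();

InstWidening pickCallWidening(int64_t ScalarCost,
                              int64_t VectorCost = Invalid,
                              int64_t IntrinsicCost = Invalid) {
  int64_t Cost = ScalarCost; // scalarization is the fallback
  InstWidening Decision = CM_Scalarize;
  if (VectorCost <= Cost) { // tie goes to the vector call
    Cost = VectorCost;
    Decision = CM_VectorCall;
  }
  if (IntrinsicCost <= Cost) // tie goes to the intrinsic
    Decision = CM_IntrinsicCall;
  return Decision;
}
```

Note the '<=' comparisons: on equal cost a vector call beats scalarization and an intrinsic beats both, so pickCallWidening(8, 8) returns CM_VectorCall, matching the patch's preference.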
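For readers wondering where the vector variants that VFDatabase scans come from: they are recorded as a "vector-function-abi-variant" string attribute on the call site, which front ends emit for constructs like OpenMP declare simd (TLI-known vector libraries are handled similarly). A sketch of source that produces such a mapping; the exact mangled variant name is illustrative:

```cpp
// Compiled with e.g. clang++ -fopenmp-simd -O2, this declaration promises a
// 4-lane vector variant of foo (something like _ZGVnN4v__Z3foof on AArch64).
// LoopVectorize's VFDatabase parses the resulting
// "vector-function-abi-variant" call-site attribute; with this patch the
// widening decision for each such call is computed once per VF and cached.
#pragma omp declare simd simdlen(4) notinbranch
float foo(float x);

void scale(float *out, const float *in, int n) {
  for (int i = 0; i < n; ++i)
    out[i] = foo(in[i]); // candidate for CM_VectorCall at VF=4
}
```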
https://github.com/llvm/llvm-project/pull/66521
More information about the llvm-commits mailing list