[llvm] [LV] Move VPlan-based calculateRegisterUsage to VPlanAnalysis (NFC). (PR #135673)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 14 13:35:28 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Florian Hahn (fhahn)
<details>
<summary>Changes</summary>
Move the VPlan-based calculateRegisterUsage from LoopVectorize.cpp
to VPlanAnalysis.cpp. It is a VPlan-based analysis, and moving it helps
to reduce the size of LoopVectorize.cpp.
Probably best to land this after https://github.com/llvm/llvm-project/pull/132190, when LoopVectorizationCostModel::RegisterUsage can be removed.
---
Patch is 34.71 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135673.diff
14 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+2-246)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+245)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.h (+23)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll (+5-5)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a28cda9fe62b3..00c3b2fb68450 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4863,250 +4863,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
}
}
-/// Get the VF scaling factor applied to the recipe's output, if the recipe has
-/// one.
-static unsigned getVFScaleFactor(VPRecipeBase *R) {
- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
- return RR->getVFScaleFactor();
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
- return RR->getVFScaleFactor();
- return 1;
-}
-
-/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
-/// by calculating the highest number of values that are live at a single
-/// location as a rough estimate. Returns the register usage for each VF in \p
-/// VFs.
-static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
-calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
- const TargetTransformInfo &TTI,
- const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
- // Each 'key' in the map opens a new interval. The values
- // of the map are the index of the 'last seen' usage of the
- // recipe that is the key.
- using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
-
- // Maps indices to recipes.
- SmallVector<VPRecipeBase *, 64> Idx2Recipe;
- // Marks the end of each interval.
- IntervalMap EndPoint;
- // Saves the list of recipe indices that are used in the loop.
- SmallPtrSet<VPRecipeBase *, 8> Ends;
- // Saves the list of values that are used in the loop but are defined outside
- // the loop (not including non-recipe values such as arguments and
- // constants).
- SmallSetVector<VPValue *, 8> LoopInvariants;
- LoopInvariants.insert(&Plan.getVectorTripCount());
-
- // We scan the loop in a topological order in order and assign a number to
- // each recipe. We use RPO to ensure that defs are met before their users. We
- // assume that each recipe that has in-loop users starts an interval. We
- // record every time that an in-loop value is used, so we have a list of the
- // first and last occurrences of each recipe.
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
- Plan.getVectorLoopRegion());
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
- if (!VPBB->getParent())
- break;
- for (VPRecipeBase &R : *VPBB) {
- Idx2Recipe.push_back(&R);
-
- // Save the end location of each USE.
- for (VPValue *U : R.operands()) {
- auto *DefR = U->getDefiningRecipe();
-
- // Ignore non-recipe values such as arguments, constants, etc.
- // FIXME: Might need some motivation why these values are ignored. If
- // for example an argument is used inside the loop it will increase the
- // register pressure (so shouldn't we add it to LoopInvariants).
- if (!DefR && (!U->getLiveInIRValue() ||
- !isa<Instruction>(U->getLiveInIRValue())))
- continue;
-
- // If this recipe is outside the loop then record it and continue.
- if (!DefR) {
- LoopInvariants.insert(U);
- continue;
- }
-
- // Overwrite previous end points.
- EndPoint[DefR] = Idx2Recipe.size();
- Ends.insert(DefR);
- }
- }
- if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
- // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
- // exiting block, where their increment will get materialized eventually.
- for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
- EndPoint[&R] = Idx2Recipe.size();
- Ends.insert(&R);
- }
- }
- }
- }
-
- // Saves the list of intervals that end with the index in 'key'.
- using RecipeList = SmallVector<VPRecipeBase *, 2>;
- SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
-
- // Next, we transpose the EndPoints into a multi map that holds the list of
- // intervals that *end* at a specific location.
- for (auto &Interval : EndPoint)
- TransposeEnds[Interval.second].push_back(Interval.first);
-
- SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
- SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
-
- LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
-
- VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
-
- const auto &TTICapture = TTI;
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
- if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
- (VF.isScalable() &&
- !TTICapture.isElementTypeLegalForScalableVector(Ty)))
- return 0;
- return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
- };
-
- // We scan the instructions linearly and record each time that a new interval
- // starts, by placing it in a set. If we find this value in TransposEnds then
- // we remove it from the set. The max register usage is the maximum register
- // usage of the recipes of the set.
- for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
- VPRecipeBase *R = Idx2Recipe[Idx];
-
- // Remove all of the recipes that end at this location.
- RecipeList &List = TransposeEnds[Idx];
- for (VPRecipeBase *ToRemove : List)
- OpenIntervals.erase(ToRemove);
-
- // Ignore recipes that are never used within the loop and do not have side
- // effects.
- if (!Ends.count(R) && !R->mayHaveSideEffects())
- continue;
-
- // Skip recipes for ignored values.
- // TODO: Should mark recipes for ephemeral values that cannot be removed
- // explictly in VPlan.
- if (isa<VPSingleDefRecipe>(R) &&
- ValuesToIgnore.contains(
- cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
- continue;
-
- // For each VF find the maximum usage of registers.
- for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
- // Count the number of registers used, per register class, given all open
- // intervals.
- // Note that elements in this SmallMapVector will be default constructed
- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
- // there is no previous entry for ClassID.
- SmallMapVector<unsigned, unsigned, 4> RegUsage;
-
- for (auto *R : OpenIntervals) {
- // Skip recipes that weren't present in the original loop.
- // TODO: Remove after removing the legacy
- // LoopVectorizationCostModel::calculateRegisterUsage
- if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
- VPBranchOnMaskRecipe>(R))
- continue;
-
- if (VFs[J].isScalar() ||
- isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
- VPScalarIVStepsRecipe>(R) ||
- (isa<VPInstruction>(R) &&
- all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
- return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
- }))) {
- unsigned ClassID = TTI.getRegisterClassForType(
- false, TypeInfo.inferScalarType(R->getVPSingleValue()));
- // FIXME: The target might use more than one register for the type
- // even in the scalar case.
- RegUsage[ClassID] += 1;
- } else {
- // The output from scaled phis and scaled reductions actually has
- // fewer lanes than the VF.
- unsigned ScaleFactor = getVFScaleFactor(R);
- ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
- LLVM_DEBUG(if (VF != VFs[J]) {
- dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
- << " for " << *R << "\n";
- });
-
- for (VPValue *DefV : R->definedValues()) {
- Type *ScalarTy = TypeInfo.inferScalarType(DefV);
- unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
- RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
- }
- }
- }
-
- for (const auto &Pair : RegUsage) {
- auto &Entry = MaxUsages[J][Pair.first];
- Entry = std::max(Entry, Pair.second);
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
- << OpenIntervals.size() << '\n');
-
- // Add the current recipe to the list of open intervals.
- OpenIntervals.insert(R);
- }
-
- // We also search for instructions that are defined outside the loop, but are
- // used inside the loop. We need this number separately from the max-interval
- // usage number because when we unroll, loop-invariant values do not take
- // more register.
- LoopVectorizationCostModel::RegisterUsage RU;
- for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
- // Note that elements in this SmallMapVector will be default constructed
- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
- // there is no previous entry for ClassID.
- SmallMapVector<unsigned, unsigned, 4> Invariant;
-
- for (auto *In : LoopInvariants) {
- // FIXME: The target might use more than one register for the type
- // even in the scalar case.
- bool IsScalar = all_of(In->users(), [&](VPUser *U) {
- return cast<VPRecipeBase>(U)->usesScalars(In);
- });
-
- ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
- unsigned ClassID = TTI.getRegisterClassForType(
- VF.isVector(), TypeInfo.inferScalarType(In));
- Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
- }
-
- LLVM_DEBUG({
- dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
- dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
- << " item\n";
- for (const auto &pair : MaxUsages[Idx]) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
- << " item\n";
- for (const auto &pair : Invariant) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- });
-
- RU.LoopInvariantRegs = Invariant;
- RU.MaxLocalUsers = MaxUsages[Idx];
- RUs[Idx] = RU;
- }
-
- return RUs;
-}
-
unsigned
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
InstructionCost LoopCost) {
@@ -5158,8 +4914,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
return 1;
}
- RegisterUsage R =
- ::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0];
+ VPRegisterUsage R =
+ calculateRegisterUsageForVPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
for (auto &Pair : R.MaxLocalUsers) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 375d4c9787994..d6ed432b53ea5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -10,8 +10,10 @@
#include "VPlan.h"
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/GenericDomTreeConstruction.h"
@@ -362,3 +364,246 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
#endif
return Base::properlyDominates(ParentA, ParentB);
}
+
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+static unsigned getVFScaleFactor(VPRecipeBase *R) {
+ if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
+ return RR->getVFScaleFactor();
+ if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+ return RR->getVFScaleFactor();
+ return 1;
+}
+
+/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
+/// by calculating the highest number of values that are live at a single
+/// location as a rough estimate. Returns the register usage for each VF in \p
+/// VFs.
+SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForVPlan(
+ VPlan &Plan, ArrayRef<ElementCount> VFs, const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // recipe that is the key.
+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
+
+ // Maps indices to recipes.
+ SmallVector<VPRecipeBase *, 64> Idx2Recipe;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+ // Saves the list of recipe indices that are used in the loop.
+ SmallPtrSet<VPRecipeBase *, 8> Ends;
+ // Saves the list of values that are used in the loop but are defined outside
+ // the loop (not including non-recipe values such as arguments and
+ // constants).
+ SmallSetVector<VPValue *, 8> LoopInvariants;
+ LoopInvariants.insert(&Plan.getVectorTripCount());
+
+ // We scan the loop in a topological order in order and assign a number to
+ // each recipe. We use RPO to ensure that defs are met before their users. We
+ // assume that each recipe that has in-loop users starts an interval. We
+ // record every time that an in-loop value is used, so we have a list of the
+ // first and last occurrences of each recipe.
+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+ Plan.getVectorLoopRegion());
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+ if (!VPBB->getParent())
+ break;
+ for (VPRecipeBase &R : *VPBB) {
+ Idx2Recipe.push_back(&R);
+
+ // Save the end location of each USE.
+ for (VPValue *U : R.operands()) {
+ auto *DefR = U->getDefiningRecipe();
+
+ // Ignore non-recipe values such as arguments, constants, etc.
+ // FIXME: Might need some motivation why these values are ignored. If
+ // for example an argument is used inside the loop it will increase the
+ // register pressure (so shouldn't we add it to LoopInvariants).
+ if (!DefR && (!U->getLiveInIRValue() ||
+ !isa<Instruction>(U->getLiveInIRValue())))
+ continue;
+
+ // If this recipe is outside the loop then record it and continue.
+ if (!DefR) {
+ LoopInvariants.insert(U);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[DefR] = Idx2Recipe.size();
+ Ends.insert(DefR);
+ }
+ }
+ if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
+ // exiting block, where their increment will get materialized eventually.
+ for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+ EndPoint[&R] = Idx2Recipe.size();
+ Ends.insert(&R);
+ }
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ using RecipeList = SmallVector<VPRecipeBase *, 2>;
+ SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
+
+ // Next, we transpose the EndPoints into a multi map that holds the list of
+ // intervals that *end* at a specific location.
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
+
+ SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
+ SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+
+ const auto &TTICapture = TTI;
+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+ if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
+ (VF.isScalable() &&
+ !TTICapture.isElementTypeLegalForScalableVector(Ty)))
+ return 0;
+ return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
+ };
+
+ // We scan the instructions linearly and record each time that a new interval
+ // starts, by placing it in a set. If we find this value in TransposEnds then
+ // we remove it from the set. The max register usage is the maximum register
+ // usage of the recipes of the set.
+ for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
+ VPRecipeBase *R = Idx2Recipe[Idx];
+
+ // Remove all of the recipes that end at this location.
+ RecipeList &List = TransposeEnds[Idx];
+ for (VPRecipeBase *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Ignore recipes that are never used within the loop and do not have side
+ // effects.
+ if (!Ends.count(R) && !R->mayHaveSideEffects())
+ continue;
+
+ // Skip recipes for ignored values.
+ // TODO: Should mark recipes for ephemeral values that cannot be removed
+ // explictly in VPlan.
+ if (isa<VPSingleDefRecipe>(R) &&
+ ValuesToIgnore.contains(
+ cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
+ // Count the number of registers used, per register class, given all open
+ // intervals.
+ // Note that elements in this SmallMapVector will be default constructed
+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
+ // there is no previous entry for ClassID.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
+ for (auto *R : OpenIntervals) {
+ // Skip recipes that weren't present in the original loop.
+ // TODO: Remove after removing the legacy
+ // LoopVectorizationCostModel::calculateRegisterUsage
+ if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
+ VPBranchOnMaskRecipe>(R))
+ continue;
+
+ if (VFs[J].isScalar() ||
+ isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
+ VPScalarIVStepsRecipe>(R) ||
+ (isa<VPInstruction>(R) &&
+ all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
+ return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
+ }))) {
+ unsigned ClassID = TTI.getRegisterClassForType(
+ false, TypeInfo.inferScalarType(R->getVPSingleValue()));
+ // FIXME: The target might use more than one register for the type
+ // even in the scalar case.
+ RegUsage[ClassID] += 1;
+ } else {
+ // The output from scaled phis and scaled reductions actually has
+ // fewer lanes than the VF.
+ unsigned ScaleFactor = getVFScaleFactor(R);
+ ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
+ LLVM_DEBUG(if (VF != VFs[J]) {
+ dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
+ << " for " << *R << "\n";
+ });
+
+ for (VPValue *DefV : R->definedValues()) {
+ Type *ScalarTy = TypeInfo.inferScalarType(DefV);
+ unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
+ RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
+ }
+ }
+ }
+
+ for (const auto &Pair : RegUsage) {
+ auto &Entry = MaxUsages[J][Pair.first];
+ Entry = std::max(Entry, Pair.second);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the cur...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/135673
More information about the llvm-commits
mailing list