[llvm] [LV] Change getSmallBestKnownTC to return an ElementCount (NFC) (PR #141793)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 17 08:00:56 PDT 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/141793
>From 8af01cac2e46e3ef9e059a49f98c08e3f6d46b04 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 16 May 2025 15:09:07 +0100
Subject: [PATCH 1/2] [NFC][LLVM][LoopVectorize] Change getSmallBestKnownTC to
return an ElementCount.
This is prep work for enabling better UF calculations when using
vscale-based VFs to vectorise loops with vscale-based trip counts.
NOTE: NFC because all uses remain fixed-length until a following PR
changes LoopVectorize's version of getSmallConstantTripCount().
---
.../Transforms/Vectorize/LoopVectorize.cpp | 49 +++++++++++--------
1 file changed, 29 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f1470fd1f7314..75152a2e8f8ca 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -419,6 +419,12 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
+/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
+/// ElementCount to include loops whose trip count is a function of vscale.
+ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) {
+ return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
+}
+
/// Returns "best known" trip count, which is either a valid positive trip count
/// or std::nullopt when an estimate cannot be made (including when the trip
/// count would overflow), for the specified loop \p L as defined by the
@@ -427,24 +433,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
/// 4) Returns std::nullopt if all of the above failed.
-static std::optional<unsigned>
+static std::optional<ElementCount>
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
bool CanUseConstantMax = true) {
// Check if exact trip count is known.
- if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
+ if (auto ExpectedTC = getSmallConstantTripCount(PSE.getSE(), L))
return ExpectedTC;
// Check if there is an expected trip count available from profile data.
if (LoopVectorizeWithBlockFrequency)
if (auto EstimatedTC = getLoopEstimatedTripCount(L))
- return *EstimatedTC;
+ return ElementCount::getFixed(*EstimatedTC);
if (!CanUseConstantMax)
return std::nullopt;
// Check if upper bound estimate is known.
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
- return ExpectedTC;
+ return ElementCount::getFixed(ExpectedTC);
return std::nullopt;
}
@@ -1960,7 +1966,8 @@ class GeneratedRTChecks {
// Get the best known TC estimate.
if (auto EstimatedTC = getSmallBestKnownTC(
PSE, OuterLoop, /* CanUseConstantMax = */ false))
- BestTripCount = *EstimatedTC;
+ if (EstimatedTC->isFixed())
+ BestTripCount = EstimatedTC->getFixedValue();
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
@@ -3751,12 +3758,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
ScalarEvolution *SE = PSE.getSE();
- unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+ ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
- if (TC != MaxTC)
+ if (TC != ElementCount::getFixed(MaxTC))
LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
- if (TC == 1) {
+ if (TC.isScalar()) {
reportVectorizationFailure("Single iteration (non) loop",
"loop trip count is one, irrelevant for vectorization",
"SingleIterationLoop", ORE, TheLoop);
@@ -3870,7 +3877,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (ExpectedTC && ExpectedTC->isFixed() &&
+ ExpectedTC->getFixedValue() <=
+ TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
@@ -3928,7 +3937,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}
- if (TC == 0) {
+ if (TC.isZero()) {
reportVectorizationFailure(
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
@@ -4817,13 +4826,13 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
- ? (*BestKnownTC) - 1
- : *BestKnownTC;
+ ? BestKnownTC->getFixedValue() - 1
+ : BestKnownTC->getFixedValue();
unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
- if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
+ if (getSmallConstantTripCount(PSE.getSE(), TheLoop).isNonZero()) {
// If the best known trip count is exact, we select between two
// prospective ICs, where
//
@@ -5183,8 +5192,8 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// costs of comparison and induction instructions, as they'll get simplified
// away.
SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
- auto TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (VF.isFixed() && TC == VF.getFixedValue() && !foldTailByMasking())
+ auto TC = getSmallConstantTripCount(PSE.getSE(), TheLoop);
+ if (TC == VF && !foldTailByMasking())
addFullyUnrolledInstructionsToIgnore(TheLoop, Legal->getInductionVars(),
ValuesToIgnoreForVF);
@@ -6884,8 +6893,8 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// simplified away.
// TODO: Remove this code after stepping away from the legacy cost model and
// adding code to simplify VPlans before calculating their costs.
- auto TC = PSE.getSE()->getSmallConstantTripCount(OrigLoop);
- if (VF.isFixed() && TC == VF.getFixedValue() && !CM.foldTailByMasking())
+ auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
+ if (TC == VF && !CM.foldTailByMasking())
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
CostCtx.SkipCostComputation);
@@ -9641,8 +9650,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
// Skip vectorization if the expected trip count is less than the minimum
// required trip count.
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
- if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
- VF.MinProfitableTripCount)) {
+ if (ElementCount::isKnownLT(*ExpectedTC, VF.MinProfitableTripCount)) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
"trip count < minimum profitable VF ("
<< *ExpectedTC << " < " << VF.MinProfitableTripCount
@@ -10012,7 +10020,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
- if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+ if (ExpectedTC && ExpectedTC->isFixed() &&
+ ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is worth vectorizing only if no scalar "
<< "iteration overheads are incurred.");
>From 29a79f119884baedb32c70fe404b1003b4094fb6 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Tue, 17 Jun 2025 14:59:30 +0000
Subject: [PATCH 2/2] Make getSmallConstantTripCount a static function.
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 75152a2e8f8ca..ac8952d21b416 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -421,7 +421,8 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
-ElementCount getSmallConstantTripCount(ScalarEvolution *SE, const Loop *L) {
+static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
+ const Loop *L) {
return ElementCount::getFixed(SE->getSmallConstantTripCount(L));
}
More information about the llvm-commits
mailing list