[llvm] 4f0be94 - [LV] Improve code in selectInterleaveCount (NFC) (#128002)
via llvm-commits
llvm-commits at lists.llvm.org
Mon May 12 09:20:13 PDT 2025
Author: Ramkumar Ramachandra
Date: 2025-05-12T17:20:10+01:00
New Revision: 4f0be9414c0c92134d4baeae3d9cdfa32d4f9848
URL: https://github.com/llvm/llvm-project/commit/4f0be9414c0c92134d4baeae3d9cdfa32d4f9848
DIFF: https://github.com/llvm/llvm-project/commit/4f0be9414c0c92134d4baeae3d9cdfa32d4f9848.diff
LOG: [LV] Improve code in selectInterleaveCount (NFC) (#128002)
Use the fact that getSmallBestKnownTC returns an exact trip count, if
possible, and falls back to returning an estimate, to factor some code
in selectInterleaveCount.
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79474b5bf7f61..a1cedbbf17b84 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4985,7 +4985,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (Legal->hasUncountableEarlyExit())
return 1;
- auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5062,51 +5061,53 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
}
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
- unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- if (KnownTC > 0) {
- // At least one iteration must be scalar when this constraint holds. So the
- // maximum available iterations for interleaving is one less.
- unsigned AvailableTC =
- requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
-
- // If trip count is known we select between two prospective ICs, where
- // 1) the aggressive IC is capped by the trip count divided by VF
- // 2) the conservative IC is capped by the trip count divided by (VF * 2)
- // The final IC is selected in a way that the epilogue loop trip count is
- // minimized while maximizing the IC itself, so that we either run the
- // vector loop at least once if it generates a small epilogue loop, or else
- // we run the vector loop at least twice.
-
- unsigned InterleaveCountUB = bit_floor(
- std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
- unsigned InterleaveCountLB = bit_floor(std::max(
- 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
- MaxInterleaveCount = InterleaveCountLB;
-
- if (InterleaveCountUB != InterleaveCountLB) {
- unsigned TailTripCountUB =
- (AvailableTC % (EstimatedVF * InterleaveCountUB));
- unsigned TailTripCountLB =
- (AvailableTC % (EstimatedVF * InterleaveCountLB));
- // If both produce same scalar tail, maximize the IC to do the same work
- // in fewer vector loop iterations
- if (TailTripCountUB == TailTripCountLB)
- MaxInterleaveCount = InterleaveCountUB;
- }
- } else if (BestKnownTC) {
+
+ // Try to get the exact trip count, or an estimate based on profiling data or
+ // ConstantMax from PSE, failing that.
+ if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
? (*BestKnownTC) - 1
: *BestKnownTC;
- // If trip count is an estimated compile time constant, limit the
- // IC to be capped by the trip count divided by VF * 2, such that the vector
- // loop runs at least twice to make interleaving seem profitable when there
- // is an epilogue loop present. Since exact Trip count is not known we
- // choose to be conservative in our IC estimate.
- MaxInterleaveCount = bit_floor(std::max(
+ unsigned InterleaveCountLB = bit_floor(std::max(
1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
+
+ if (PSE.getSE()->getSmallConstantTripCount(TheLoop) > 0) {
+ // If the best known trip count is exact, we select between two
+ // prospective ICs, where
+ //
+ // 1) the aggressive IC is capped by the trip count divided by VF
+ // 2) the conservative IC is capped by the trip count divided by (VF * 2)
+ //
+ // The final IC is selected in a way that the epilogue loop trip count is
+ // minimized while maximizing the IC itself, so that we either run the
+ // vector loop at least once if it generates a small epilogue loop, or
+ // else we run the vector loop at least twice.
+
+ unsigned InterleaveCountUB = bit_floor(std::max(
+ 1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
+ MaxInterleaveCount = InterleaveCountLB;
+
+ if (InterleaveCountUB != InterleaveCountLB) {
+ unsigned TailTripCountUB =
+ (AvailableTC % (EstimatedVF * InterleaveCountUB));
+ unsigned TailTripCountLB =
+ (AvailableTC % (EstimatedVF * InterleaveCountLB));
+ // If both produce same scalar tail, maximize the IC to do the same work
+ // in fewer vector loop iterations
+ if (TailTripCountUB == TailTripCountLB)
+ MaxInterleaveCount = InterleaveCountUB;
+ }
+ } else {
+ // If trip count is an estimated compile time constant, limit the
+ // IC to be capped by the trip count divided by VF * 2, such that the
+ // vector loop runs at least twice to make interleaving seem profitable
+ // when there is an epilogue loop present. Since exact Trip count is not
+ // known we choose to be conservative in our IC estimate.
+ MaxInterleaveCount = InterleaveCountLB;
+ }
}
assert(MaxInterleaveCount > 0 &&
More information about the llvm-commits mailing list