[llvm] [LV][NFC] Clean up tail-folding check for early-exit loops (PR #133931)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 1 18:39:49 PDT 2025
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/133931
>From 18ea5709eceeff6062b5223abd831e78fcb78dc4 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Mon, 31 Mar 2025 23:54:09 -0700
Subject: [PATCH 1/2] [LV][NFC] Clean up tail-folding check for early-exit
loops
This patch moves the check for a single latch exit from computeMaxVF()
to LoopVectorizationLegality::canFoldTailByMasking(), because the
fallback that accompanied the check in computeMaxVF() duplicates the
logic that already runs when foldTailByMasking() returns false.
It also introduces a HasSingleLatchExit flag in computeMaxVF() so that
early-exit loops do not enter code paths that assume non-predicated
loops.
---
.../Vectorize/LoopVectorizationLegality.cpp | 8 +++++++
.../Transforms/Vectorize/LoopVectorize.cpp | 21 +++----------------
2 files changed, 11 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3ec6850d6f685..0dc065333f807 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1924,6 +1924,14 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
}
}
+ // The only loops we can vectorize without a scalar epilogue, are loops with
+ // a bottom-test and a single exiting block. We'd have to handle the fact
+ // that not every instruction executes on the last iteration. This will
+ // require a lane mask which varies through the vector loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55cc801e91452..477514d907201 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3987,22 +3987,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
break;
}
- // The only loops we can vectorize without a scalar epilogue, are loops with
- // a bottom-test and a single exiting block. We'd have to handle the fact
- // that not every instruction executes on the last iteration. This will
- // require a lane mask which varies through the vector loop body. (TODO)
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- // If there was a tail-folding hint/switch, but we can't fold the tail by
- // masking, fallback to a vectorization with a scalar epilogue.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
- LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
- "scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(MaxTC, UserVF, false);
- }
- return FixedScalableVFPair::getNone();
- }
-
// Now try the tail folding
// Invalidate interleave groups that require an epilogue if we can't mask
@@ -4049,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};
- if (MaxPowerOf2RuntimeVF > 0u) {
+ bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+ if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4060,7 +4045,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
>From 45541a4937ae935bdcd2a523c482e5eb8d60b82b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 1 Apr 2025 18:38:16 -0700
Subject: [PATCH 2/2] Style update
---
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 4 +++-
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++--
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0dc065333f807..0763a255b3afa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1929,7 +1929,9 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
// that not every instruction executes on the last iteration. This will
// require a lane mask which varies through the vector loop body. (TODO)
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
return false;
}
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 477514d907201..a010f5c52e9a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4033,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};
- bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+ bool HasSingleLatchExit =
+ TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
@@ -4045,7 +4046,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (HasSingleLatchExit && ExpectedTC &&
+ ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
More information about the llvm-commits
mailing list