[llvm] [LV][NFC] Clean up tail-folding check for early-exit loops (PR #133931)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 3 08:27:01 PDT 2025
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/133931
>From 18ea5709eceeff6062b5223abd831e78fcb78dc4 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Mon, 31 Mar 2025 23:54:09 -0700
Subject: [PATCH 1/3] [LV][NFC] Clean up tail-folding check for early-exit
loops
This patch moves the single-latch-exit check from computeMaxVF() to
LoopVectorizationLegality::canFoldTailByMasking(), because the check in
computeMaxVF() duplicates the logic that already runs when
foldTailByMasking() returns false.
It also introduces a HasSingleLatchExit flag in computeMaxVF() to keep
early-exit loops out of code paths that assume non-predicated loops.
---
.../Vectorize/LoopVectorizationLegality.cpp | 8 +++++++
.../Transforms/Vectorize/LoopVectorize.cpp | 21 +++----------------
2 files changed, 11 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3ec6850d6f685..0dc065333f807 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1924,6 +1924,14 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
}
}
+ // The only loops we can vectorize without a scalar epilogue, are loops with
+ // a bottom-test and a single exiting block. We'd have to handle the fact
+ // that not every instruction executes on the last iteration. This will
+ // require a lane mask which varies through the vector loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55cc801e91452..477514d907201 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3987,22 +3987,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
break;
}
- // The only loops we can vectorize without a scalar epilogue, are loops with
- // a bottom-test and a single exiting block. We'd have to handle the fact
- // that not every instruction executes on the last iteration. This will
- // require a lane mask which varies through the vector loop body. (TODO)
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- // If there was a tail-folding hint/switch, but we can't fold the tail by
- // masking, fallback to a vectorization with a scalar epilogue.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
- LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
- "scalar epilogue instead.\n");
- ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
- return computeFeasibleMaxVF(MaxTC, UserVF, false);
- }
- return FixedScalableVFPair::getNone();
- }
-
// Now try the tail folding
// Invalidate interleave groups that require an epilogue if we can't mask
@@ -4049,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};
- if (MaxPowerOf2RuntimeVF > 0u) {
+ bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+ if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4060,7 +4045,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
>From 45541a4937ae935bdcd2a523c482e5eb8d60b82b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 1 Apr 2025 18:38:16 -0700
Subject: [PATCH 2/3] Style update
---
llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 4 +++-
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++--
2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0dc065333f807..0763a255b3afa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1929,7 +1929,9 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
// that not every instruction executes on the last iteration. This will
// require a lane mask which varies through the vector loop body. (TODO)
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
return false;
}
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 477514d907201..a010f5c52e9a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4033,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};
- bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+ bool HasSingleLatchExit =
+ TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
@@ -4045,7 +4046,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (HasSingleLatchExit && ExpectedTC &&
+ ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
>From e9df81f6a588211931d55f600372910d79d08e55 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Thu, 3 Apr 2025 07:15:20 -0700
Subject: [PATCH 3/3] Moves check to NoScalarEpilogueNeeded
---
.../Vectorize/LoopVectorizationLegality.cpp | 20 +++++++++----------
.../Transforms/Vectorize/LoopVectorize.cpp | 20 +++++++++----------
2 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0763a255b3afa..8e09e6f8d4935 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1872,6 +1872,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
}
bool LoopVectorizationLegality::canFoldTailByMasking() const {
+ // The only loops we can vectorize without a scalar epilogue, are loops with
+ // a bottom-test and a single exiting block. We'd have to handle the fact
+ // that not every instruction executes on the last iteration. This will
+ // require a lane mask which varies through the vector loop body. (TODO)
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
@@ -1924,16 +1934,6 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
}
}
- // The only loops we can vectorize without a scalar epilogue, are loops with
- // a bottom-test and a single exiting block. We'd have to handle the fact
- // that not every instruction executes on the last iteration. This will
- // require a lane mask which varies through the vector loop body. (TODO)
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
- LLVM_DEBUG(
- dbgs()
- << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
- return false;
- }
LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a010f5c52e9a7..01254b51a94e8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4016,14 +4016,17 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !Legal->hasUncountableEarlyExit())
+ return false;
unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
ScalarEvolution *SE = PSE.getSE();
- // Currently only loops with countable exits are vectorized, but calling
- // getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
- // uncountable exits whilst also ensuring the symbolic maximum and known
- // back-edge taken count remain identical for loops with countable exits.
+ // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
+ // with uncountable exits. For countable loops, the symbolic maximum must
+ // remain identical to the known back-edge taken count.
const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
- assert(BackedgeTakenCount == PSE.getBackedgeTakenCount() &&
+ assert((Legal->hasUncountableEarlyExit() ||
+ BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
"Invalid loop count");
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
@@ -4033,9 +4036,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};
- bool HasSingleLatchExit =
- TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
- if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
+ if (MaxPowerOf2RuntimeVF > 0u) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4046,8 +4047,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
- if (HasSingleLatchExit && ExpectedTC &&
- ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+ if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
More information about the llvm-commits
mailing list