[llvm] [LV][NFC] Clean up tail-folding check for early-exit loops (PR #133931)

Shih-Po Hung via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 1 18:39:49 PDT 2025


https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/133931

>From 18ea5709eceeff6062b5223abd831e78fcb78dc4 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Mon, 31 Mar 2025 23:54:09 -0700
Subject: [PATCH 1/2] [LV][NFC] Clean up tail-folding check for early-exit
 loops

This patch moves the single-latch-exit check from computeMaxVF() into
LoopVectorizationLegality::canFoldTailByMasking(), because the check in
computeMaxVF() duplicates the logic that already runs when
foldTailByMasking() returns false.

It also introduces HasSingleLatchExit to prevent early-exit loops from
entering code paths that assume non-predicated loops.
---
 .../Vectorize/LoopVectorizationLegality.cpp   |  8 +++++++
 .../Transforms/Vectorize/LoopVectorize.cpp    | 21 +++----------------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3ec6850d6f685..0dc065333f807 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1924,6 +1924,14 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
     }
   }
 
+  // The only loops we can vectorize without a scalar epilogue, are loops with
+  // a bottom-test and a single exiting block. We'd have to handle the fact
+  // that not every instruction executes on the last iteration.  This will
+  // require a lane mask which varies through the vector loop body.  (TODO)
+  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
+    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+    return false;
+  }
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
 
   return true;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55cc801e91452..477514d907201 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3987,22 +3987,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     break;
   }
 
-  // The only loops we can vectorize without a scalar epilogue, are loops with
-  // a bottom-test and a single exiting block. We'd have to handle the fact
-  // that not every instruction executes on the last iteration.  This will
-  // require a lane mask which varies through the vector loop body.  (TODO)
-  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-    // If there was a tail-folding hint/switch, but we can't fold the tail by
-    // masking, fallback to a vectorization with a scalar epilogue.
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
-                           "scalar epilogue instead.\n");
-      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-      return computeFeasibleMaxVF(MaxTC, UserVF, false);
-    }
-    return FixedScalableVFPair::getNone();
-  }
-
   // Now try the tail folding
 
   // Invalidate interleave groups that require an epilogue if we can't mask
@@ -4049,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return Rem->isZero();
   };
 
-  if (MaxPowerOf2RuntimeVF > 0u) {
+  bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+  if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
            "MaxFixedVF must be a power of 2");
     if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
@@ -4060,7 +4045,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   }
 
   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
-  if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+  if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
     if (MaxPowerOf2RuntimeVF > 0u) {
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width

>From 45541a4937ae935bdcd2a523c482e5eb8d60b82b Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Tue, 1 Apr 2025 18:38:16 -0700
Subject: [PATCH 2/2] Style update

---
 llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp | 4 +++-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp             | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 0dc065333f807..0763a255b3afa 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1929,7 +1929,9 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
   // that not every instruction executes on the last iteration.  This will
   // require a lane mask which varies through the vector loop body.  (TODO)
   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
-    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
+    LLVM_DEBUG(
+        dbgs()
+        << "LV: Cannot fold tail by masking. Requires a singe latch exit\n");
     return false;
   }
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 477514d907201..a010f5c52e9a7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4033,7 +4033,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     return Rem->isZero();
   };
 
-  bool HasSingleLatchExit = TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
+  bool HasSingleLatchExit =
+      TheLoop->getExitingBlock() == TheLoop->getLoopLatch();
   if (HasSingleLatchExit && MaxPowerOf2RuntimeVF > 0u) {
     assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
            "MaxFixedVF must be a power of 2");
@@ -4045,7 +4046,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   }
 
   auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
-  if (HasSingleLatchExit && ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
+  if (HasSingleLatchExit && ExpectedTC &&
+      ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
     if (MaxPowerOf2RuntimeVF > 0u) {
       // If we have a low-trip-count, and the fixed-width VF is known to divide
       // the trip count but the scalable factor does not, use the fixed-width



More information about the llvm-commits mailing list