[llvm-branch-commits] [llvm] [LV] Vectorize uncountable early exit store loops with combined conditions (PR #205109)

Graham Hunter via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jun 24 05:49:31 PDT 2026


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/205109

>From 9274730ce4f57f795a889380dde53aae74f41049 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 5 Jun 2026 10:19:21 +0000
Subject: [PATCH 1/2] [LV] Vectorize uncountable early exit store loops with
 combined conditions

Support the case where the countable and uncountable exit conditions have
been combined by earlier passes into a single logical-or branch condition in
the loop latch.
---
 .../Vectorize/LoopVectorizationLegality.cpp   | 67 ++++++++++++------
 .../Transforms/Vectorize/LoopVectorize.cpp    | 21 ++++++
 .../Transforms/Vectorize/VPlanPatternMatch.h  | 12 ++++
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 55 +++++++++++++++
 .../VPlan/early_exit_with_stores_vplan.ll     | 70 +++++++++++++++++++
 .../X86/vectorization-remarks-missed.ll       | 18 +++++
 .../early_exit_combined_exits.ll              | 32 ++++++++-
 .../LoopVectorize/early_exit_legality.ll      |  5 +-
 .../early_exit_store_legality.ll              |  2 +-
 .../uncountable-single-exit-loops.ll          |  2 +
 10 files changed, 255 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 6813930c76a6d..a233a67f4db39 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1688,16 +1688,30 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
   }
 
   // The latch block must have a countable exit.
-  if (isa<SCEVCouldNotCompute>(
-          PSE.getSE()->getPredicatedExitCount(TheLoop, LatchBB, &Predicates))) {
+  if (isa<SCEVCouldNotCompute>(PSE.getSE()->getPredicatedExitCount(
+          TheLoop, LatchBB, &Predicates, ScalarEvolution::SymbolicMaximum))) {
     reportVectorizationFailure(
         "Cannot determine exact exit count for latch block",
         "Cannot vectorize early exit loop",
         "UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
     return false;
   }
-  assert(llvm::is_contained(CountableExitingBlocks, LatchBB) &&
-         "Latch block not found in list of countable exits!");
+
+  if (!is_contained(CountableExitingBlocks, LatchBB)) {
+    // If not a separate counted exit in the latch, then check for a combined
+    // countable and uncountable exit.
+    BasicBlock *TrueBB, *FalseBB;
+    // Do we know the IV here?
+    if (!match(LatchBB->getTerminator(),
+               m_Br(m_c_LogicalOr(m_Value(), m_Cmp(m_Add(m_Value(), m_Value()),
+                                                   m_Value())),
+                    TrueBB, FalseBB))) {
+      reportVectorizationFailure(
+          "Latch block does not have a countable exit condition",
+          "NoCountableConditionInLatchBlock", ORE, TheLoop);
+      return false;
+    }
+  }
 
   // Check to see if there are instructions that could potentially generate
   // exceptions or have side-effects.
@@ -1775,6 +1789,13 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
     }
   }
 
+  // We're only handling combined exit conditions via masking at present, which
+  // is used for loops with side effects.
+  // TODO: Support readonly loops with combined exit conditions.
+  // TODO: Decouple style from the presence of side effects.
+  if (!llvm::is_contained(CountableExitingBlocks, LatchBB) && !HasSideEffects)
+    return false;
+
   [[maybe_unused]] const SCEV *SymbolicMaxBTC =
       PSE.getSymbolicMaxBackedgeTakenCount();
   // Since we have an exact exit count for the latch and the early exit
@@ -1804,9 +1825,16 @@ bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
   Instruction *L = nullptr;
   Value *Ptr = nullptr;
   Value *R = nullptr;
-  if (!match(Br->getCondition(),
-             m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
-                             m_Value(R))))) {
+  if (!match(
+          Br->getCondition(),
+          m_CombineOr(
+              m_OneUse(m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
+                              m_Value(R))),
+              m_OneUse(m_LogicalOr(
+                  m_OneUse(
+                      m_ICmp(m_OneUse(m_Instruction(L, m_Load(m_Value(Ptr)))),
+                             m_Value(R))),
+                  m_ICmp(m_Add(m_Value(), m_Value()), m_Value())))))) {
     reportVectorizationFailure(
         "Early exit loop with store but no supported condition load",
         "NoConditionLoadForEarlyExitLoop", ORE, TheLoop);
@@ -1933,24 +1961,17 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
-  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
-    if (TheLoop->getExitingBlock()) {
+  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount()) &&
+      !isVectorizableEarlyExitLoop()) {
+    assert(UncountableExitType == UncountableExitTrait::None &&
+           "Must be false without vectorizable early-exit loop");
+    if (TheLoop->getExitingBlock())
       reportVectorizationFailure("Cannot vectorize uncountable loop",
                                  "UnsupportedUncountableLoop", ORE, TheLoop);
-      if (DoExtraAnalysis)
-        Result = false;
-      else
-        return false;
-    } else {
-      if (!isVectorizableEarlyExitLoop()) {
-        assert(UncountableExitType == UncountableExitTrait::None &&
-               "Must be false without vectorizable early-exit loop");
-        if (DoExtraAnalysis)
-          Result = false;
-        else
-          return false;
-      }
-    }
+    if (DoExtraAnalysis)
+      Result = false;
+    else
+      return false;
   }
 
   // Go over each instruction and look at memory deps.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3dbee08e7d7d8..69279a18dd2fd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2706,6 +2706,27 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
       continue;
     auto *Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
+    // TODO: This might occur for a multi-exit readonly loop too?
+    //       Excluded for now in LVL.
+    // TODO: Do we have the main IV available somewhere? this feels a little
+    //       fragile.
+    // If we have an exit condition that is actually two conditions combined
+    // via an or, only add the countable comparison as a uniform value.
+    if (Legal->hasUncountableExitWithSideEffects() &&
+        TheLoop->getLoopLatch() == E) {
+      Value *Uncounted, *Counted, *IV;
+      using namespace llvm::PatternMatch;
+      if (match(Cmp,
+                m_c_LogicalOr(
+                    m_Value(Uncounted, m_Cmp(m_Load(m_Value()), m_Value())),
+                    m_Value(Counted, m_Cmp(m_Add(m_Value(IV), m_Value()),
+                                           m_Value()))))) {
+        if (isa<PHINode>(IV)) {
+          AddToWorklistIfAllowed(cast<Instruction>(Counted));
+          continue;
+        }
+      }
+    }
     if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
       AddToWorklistIfAllowed(Cmp);
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 1f133cbaa95bb..e66442645b9c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -203,6 +203,18 @@ inline bind_const_int m_ConstantInt(uint64_t &C) { return C; }
 /// Match a VPValue, capturing it if we match.
 inline match_bind<VPValue> m_VPValue(VPValue *&V) { return V; }
 
+/// Match a VPValue against the nested pattern \p P; bind it to \p V on success.
+template <typename Pattern>
+inline auto m_VPValue(VPValue *&V, const Pattern &P) {
+  return m_CombineAnd(P, match_bind<VPValue>(V));
+}
+
+/// Const overload: match the nested pattern \p P; bind to \p V on success.
+template <typename Pattern>
+inline auto m_VPValue(const VPValue *&V, const Pattern &P) {
+  return m_CombineAnd(P, match_bind<const VPValue>(V));
+}
+
 /// Match a VPIRValue.
 inline match_bind<VPIRValue> m_VPIRValue(VPIRValue *&V) { return V; }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index adcfe30ff9561..61dfe79df23c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -4638,6 +4638,61 @@ bool VPlanTransforms::handleUncountableEarlyExits(
     }
   }
 
+  // If we didn't find any, perhaps the exit was combined.
+  if (Exits.empty() && Plan.getExitBlocks().size() == 1) {
+    // TODO: Make this work with other styles.
+    if (Style != UncountableExitStyle::MaskedHandleExitInScalarLoop)
+      return false;
+
+    // TODO: Relax assumptions to cover more loops.
+    VPValue *Uncounted = nullptr;
+    VPValue *Counted = nullptr;
+    auto *IV = cast<VPSingleDefRecipe>(&HeaderVPBB->front());
+    VPRecipeBase *LatchBr = LatchVPBB->getTerminator();
+
+    if (!match(
+            LatchBr,
+            m_BranchOnCond(m_c_LogicalOr(
+                m_VPValue(Uncounted,
+                          m_Cmp(m_VPInstruction<Instruction::Load>(m_VPValue()),
+                                m_VPValue())),
+                m_VPValue(Counted, m_Cmp(m_Add(m_Specific(IV), m_VPValue()),
+                                         m_VPValue()))))))
+      return false;
+
+    // TODO: Exits currently assumes the ExitBlock must be an existing IR
+    //       basic block, and MiddleVPBB doesn't qualify. For now, hack around
+    //       this and duplicate the work from below.
+    // TODO: Find a nicer way to integrate this into the rest of the function.
+
+    auto *CondToEarlyExit =
+        LatchBuilder.createNaryOp(VPInstruction::MaskedCond, Uncounted);
+
+    VPValue *IsUncountableExitTaken =
+        LatchBuilder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
+
+    DebugLoc LatchDL = LatchBr->getDebugLoc();
+    VPSingleDefRecipe *LBC = cast<VPSingleDefRecipe>(LatchBr->getOperand(0));
+    LatchBr->eraseFromParent();
+    // Deleting the condition because of the single use restriction...
+    // TODO: Relax single use a bit?
+    LBC->eraseFromParent();
+    LatchBuilder.setInsertPoint(LatchVPBB);
+    LatchBuilder.createNaryOp(VPInstruction::BranchOnTwoConds,
+                              {IsUncountableExitTaken, Counted}, LatchDL);
+    // TODO: Are we guaranteed to have the successors in the expected order
+    //       at this point?
+    LatchVPBB->clearSuccessors();
+
+    // If handling the exiting lane in the scalar loop, both exit conditions
+    // of the BranchOnTwoConds lead to the middle block.
+    LatchVPBB->setSuccessors({MiddleVPBB, MiddleVPBB, HeaderVPBB});
+    MiddleVPBB->clearPredecessors();
+    MiddleVPBB->setPredecessors({LatchVPBB, LatchVPBB});
+    return handleUncountableExitsWithSideEffects(
+        Plan, Exits, HeaderVPBB, LatchVPBB, MiddleVPBB, TheLoop, PSE, DT, AC);
+  }
+
   assert(!Exits.empty() && "must have at least one early exit");
   // Sort exits by RPO order to get correct program order. RPO gives a
   // topological ordering of the CFG, ensuring upstream exits are checked
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/early_exit_with_stores_vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/early_exit_with_stores_vplan.ll
index 822bc1d4f3d93..7786d869a4394 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/early_exit_with_stores_vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/early_exit_with_stores_vplan.ll
@@ -270,6 +270,76 @@ exit:
 }
 
 define void @combined_exit_conditions(ptr align 4 dereferenceable(80) readonly %src, ptr align 4 dereferenceable(80) noalias %dst, ptr align 4 dereferenceable(80) readonly %pred) {
+; CHECK-LABEL: VPlan for loop in 'combined_exit_conditions'
+; CHECK:  VPlan 'Initial VPlan for VF={4},UF>=1' {
+; CHECK-NEXT:  Live-in vp<[[VP0:%[0-9]+]]> = VF
+; CHECK-NEXT:  Live-in vp<[[VP1:%[0-9]+]]> = VF * UF
+; CHECK-NEXT:  Live-in vp<[[VP2:%[0-9]+]]> = vector-trip-count
+; CHECK-NEXT:  Live-in ir<20> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<entry>:
+; CHECK-NEXT:  Successor(s): scalar.ph, vector.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  vector.ph:
+; CHECK-NEXT:  Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT:  <x1> vector loop: {
+; CHECK-NEXT:  vp<[[VP3:%[0-9]+]]> = CANONICAL-IV
+; CHECK-EMPTY:
+; CHECK-NEXT:    vector.body:
+; CHECK-NEXT:      ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      vp<[[VP4:%[0-9]+]]> = SCALAR-STEPS vp<[[VP3]]>, ir<1>, vp<[[VP0]]>
+; CHECK-NEXT:      CLONE ir<%ee.ptr> = getelementptr inbounds nuw ir<%pred>, vp<[[VP4]]>
+; CHECK-NEXT:      vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds nuw ir<%ee.ptr>, ir<1>
+; CHECK-NEXT:      WIDEN ir<%ee.val> = load vp<[[VP5]]>
+; CHECK-NEXT:      WIDEN ir<%ee.cmp> = icmp ne ir<%ee.val>, ir<0>
+; CHECK-NEXT:      EMIT vp<[[VP6:%[0-9]+]]> = first-active-lane ir<%ee.cmp>
+; CHECK-NEXT:      EMIT vp<%uncountable.exit.mask> = active lane mask ir<0>, vp<[[VP6]]>, ir<1>
+; CHECK-NEXT:      CLONE ir<%src.ptr> = getelementptr ir<%src>, vp<[[VP4]]>
+; CHECK-NEXT:      vp<[[VP7:%[0-9]+]]> = vector-pointer ir<%src.ptr>, ir<1>
+; CHECK-NEXT:      WIDEN ir<%data> = load vp<[[VP7]]>, vp<%uncountable.exit.mask>
+; CHECK-NEXT:      WIDEN ir<%add> = add nsw ir<%data>, ir<1>
+; CHECK-NEXT:      CLONE ir<%dst.ptr> = getelementptr ir<%dst>, vp<[[VP4]]>
+; CHECK-NEXT:      vp<[[VP8:%[0-9]+]]> = vector-pointer ir<%dst.ptr>, ir<1>
+; CHECK-NEXT:      WIDEN store vp<[[VP8]]>, ir<%add>, vp<%uncountable.exit.mask>
+; CHECK-NEXT:      EMIT vp<[[VP9:%[0-9]+]]> = any-of ir<%ee.cmp>
+; CHECK-NEXT:      EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1]]>
+; CHECK-NEXT:      EMIT vp<[[VP10:%[0-9]+]]> = icmp eq vp<%index.next>, vp<[[VP2]]>
+; CHECK-NEXT:      EMIT branch-on-two-conds vp<[[VP9]]>, vp<[[VP10]]>
+; CHECK-NEXT:    No successors
+; CHECK-NEXT:  }
+; CHECK-NEXT:  Successor(s): middle.block, middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT:  middle.block:
+; CHECK-NEXT:    EMIT vp<[[VP12:%[0-9]+]]> = extract-lane ir<0>, ir<%iv>
+; CHECK-NEXT:    EMIT vp<[[VP13:%[0-9]+]]> = add vp<[[VP12]]>, vp<[[VP6]]>
+; CHECK-NEXT:    EMIT vp<[[VP14:%[0-9]+]]> = icmp eq vp<[[VP13]]>, ir<20>
+; CHECK-NEXT:    EMIT branch-on-cond vp<[[VP14]]>
+; CHECK-NEXT:  Successor(s): ir-bb<exit>, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<exit>:
+; CHECK-NEXT:  No successors
+; CHECK-EMPTY:
+; CHECK-NEXT:  scalar.ph:
+; CHECK-NEXT:    EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VP13]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
+; CHECK-NEXT:  Successor(s): ir-bb<for.body>
+; CHECK-EMPTY:
+; CHECK-NEXT:  ir-bb<for.body>:
+; CHECK-NEXT:    IR   %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] (extra operand: vp<%bc.resume.val> from scalar.ph)
+; CHECK-NEXT:    IR   %src.ptr = getelementptr inbounds nuw [4 x i8], ptr %src, i64 %iv
+; CHECK-NEXT:    IR   %data = load i32, ptr %src.ptr, align 4
+; CHECK-NEXT:    IR   %add = add nsw i32 %data, 1
+; CHECK-NEXT:    IR   %dst.ptr = getelementptr inbounds nuw [4 x i8], ptr %dst, i64 %iv
+; CHECK-NEXT:    IR   store i32 %add, ptr %dst.ptr, align 4
+; CHECK-NEXT:    IR   %ee.ptr = getelementptr inbounds nuw [4 x i8], ptr %pred, i64 %iv
+; CHECK-NEXT:    IR   %ee.val = load i32, ptr %ee.ptr, align 4
+; CHECK-NEXT:    IR   %ee.cmp = icmp ne i32 %ee.val, 0
+; CHECK-NEXT:    IR   %iv.next = add nuw nsw i64 %iv, 1
+; CHECK-NEXT:    IR   %counted.cmp = icmp eq i64 %iv.next, 20
+; CHECK-NEXT:    IR   %combined.cond = select i1 %ee.cmp, i1 true, i1 %counted.cmp
+; CHECK-NEXT:  No successors
+; CHECK-NEXT:  }
+;
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index 5f2a93b230995..f233f71d93f9c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -48,6 +48,15 @@
 
 ; YAML:       --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            NoCountableConditionInLatchBlock
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 5, Column: 9 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          Latch block does not have a countable exit condition
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            UnsupportedUncountableLoop
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 5, Column: 9 }
 ; YAML-NEXT: Function:        _Z4testPii
@@ -137,6 +146,15 @@
 ; YAML-NEXT: ...
 ; YAML-NEXT: --- !Analysis
 ; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            RecurrencesInEarlyExitLoop
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
+; YAML-NEXT: Function:        test_multiple_failures
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          Cannot vectorize early exit loop with reductions or recurrences
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            UnsupportedUncountableLoop
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }
 ; YAML-NEXT: Function:        test_multiple_failures
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_combined_exits.ll b/llvm/test/Transforms/LoopVectorize/early_exit_combined_exits.ll
index 43a62e19eb0c3..ad844ce816e80 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_combined_exits.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_combined_exits.ll
@@ -4,10 +4,36 @@
 define void @combined_exit_conditions(ptr align 4 dereferenceable(80) readonly %src, ptr align 4 dereferenceable(80) noalias %dst, ptr align 4 dereferenceable(80) readonly %pred) {
 ; CHECK-LABEL: define void @combined_exit_conditions(
 ; CHECK-SAME: ptr readonly align 4 dereferenceable(80) [[SRC:%.*]], ptr noalias align 4 dereferenceable(80) [[DST:%.*]], ptr readonly align 4 dereferenceable(80) [[PRED:%.*]]) {
-; CHECK-NEXT:  [[SCALAR_PH:.*]]:
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[FOR_BODY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[PRED]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
+; CHECK-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [4 x i8], ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 4 [[TMP0]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <4 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 1)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [4 x i8], ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr align 4 [[TMP5]], <4 x i1> [[UNCOUNTABLE_EXIT_MASK]])
+; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[TMP10]], 20
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[EXIT:.*]], label %[[SCALAR_PH:.*]]
+; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    br label %[[FOR_BODY1:.*]]
 ; CHECK:       [[FOR_BODY1]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY1]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[TMP10]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY1]] ]
 ; CHECK-NEXT:    [[SRC_PTR:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[DATA:%.*]] = load i32, ptr [[SRC_PTR]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[DATA]], 1
@@ -19,7 +45,7 @@ define void @combined_exit_conditions(ptr align 4 dereferenceable(80) readonly %
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[COUNTED_CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 20
 ; CHECK-NEXT:    [[COMBINED_COND:%.*]] = select i1 [[EE_CMP]], i1 true, i1 [[COUNTED_CMP]]
-; CHECK-NEXT:    br i1 [[COMBINED_COND]], label %[[EXIT:.*]], label %[[FOR_BODY1]]
+; CHECK-NEXT:    br i1 [[COMBINED_COND]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
index 88a2bccf9c904..69d28fec87cde 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll
@@ -383,7 +383,7 @@ loop.end:
 
 define i64 @uncountable_exit_infinite_loop() {
 ; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_infinite_loop'
-; CHECK:       LV: Not vectorizing: Cannot vectorize uncountable loop.
+; CHECK:       LV: Not vectorizing: Cannot determine exact exit count for latch block.
 entry:
   %p1 = alloca [1024 x i8]
   %p2 = alloca [1024 x i8]
@@ -470,7 +470,8 @@ loop.end:
 
 define void @exit_conditions_combined_in_single_branch(ptr noalias dereferenceable(40) %array, ptr readonly align 2 dereferenceable(40) %pred) {
 ; CHECK-LABEL: LV: Checking a loop in 'exit_conditions_combined_in_single_branch'
-; CHECK:       LV: Not vectorizing: Cannot vectorize uncountable loop.
+; CHECK:       LV: We can vectorize this loop!
+; CHECK:       LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit and side effects is not enabled.
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
index 9b2d26ae84102..36082c61296e2 100644
--- a/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
+++ b/llvm/test/Transforms/LoopVectorize/early_exit_store_legality.ll
@@ -937,7 +937,7 @@ invalid.block:
 
 define void @combined_exit_conditions(ptr align 4 dereferenceable(80) readonly %src, ptr align 4 dereferenceable(80) noalias %dst, ptr align 4 dereferenceable(80) readonly %pred) {
 ; CHECK-LABEL: LV: Checking a loop in 'combined_exit_conditions'
-; CHECK:       LV:  Not vectorizing: Cannot vectorize uncountable loop.
+; CHECK:       LV: We can vectorize this loop!
 entry:
   br label %for.body
 
diff --git a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
index e7ecf459ca6c9..327c9f1668854 100644
--- a/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/uncountable-single-exit-loops.ll
@@ -4,12 +4,14 @@
 
 ; CHECK-LABEL: LV: Checking a loop in 'latch_exit_cannot_compute_btc_due_to_step'
 ; CHECK: 	   LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
 ; CHECK-NEXT:  LV: Not vectorizing: Cannot vectorize uncountable loop.
 ; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
 
 ; CHECK-LABEL: LV: Checking a loop in 'header_exit_cannot_compute_btc_due_to_step'
 ; CHECK:       LV: Found an induction variable.
 ; CHECK-NEXT:  LV: Did not find one integer induction var.
+; CHECK-NEXT:  LV: Not vectorizing: Cannot determine exact exit count for latch block.
 ; CHECK-NEXT:  LV: Not vectorizing: Cannot vectorize uncountable loop.
 ; CHECK-NEXT:  LV: Not vectorizing: Cannot prove legality.
 

>From a2f61a932aafba4fe9cb8a01b0c334fea56d08d6 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 24 Jun 2026 12:48:35 +0000
Subject: [PATCH 2/2] Rebase to show phase ordering test vectorizing

---
 ...ountable-and-uncountable-exits-combined.ll | 40 +++++++++++++++++--
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/countable-and-uncountable-exits-combined.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/countable-and-uncountable-exits-combined.ll
index 2b2f0fbc87762..475e32da9c419 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/countable-and-uncountable-exits-combined.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/countable-and-uncountable-exits-combined.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -passes="default<O3>" -S %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes="default<O3>" -enable-early-exit-vectorization-with-side-effects -S %s | FileCheck %s
 
 target triple = "aarch64"
 
@@ -26,9 +26,41 @@ define void @foo() #0 {
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP0]], 2500
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 10000, [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
 ; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw [4 x i8], ptr @c, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD_FR:%.*]] = freeze <vscale x 4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD_FR]], zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP3]], i1 false)
+; CHECK-NEXT:    [[UNCOUNTABLE_EXIT_MASK:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr [4 x i8], ptr @src, i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP5]], <vscale x 4 x i1> [[UNCOUNTABLE_EXIT_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], splat (i32 42)
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr [4 x i8], ptr @dst, i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP6]], ptr align 4 [[TMP7]], <vscale x 4 x i1> [[UNCOUNTABLE_EXIT_MASK]])
+; CHECK-NEXT:    [[TMP8:%.*]] = tail call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP3]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[TMP11]], 10000
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[FOR_BODY_PREHEADER]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY1:.*]]
+; CHECK:       [[FOR_BODY1]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY1]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr @src, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[SRC_PTR:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[SRC_PTR]], 42
@@ -40,7 +72,7 @@ define void @foo() #0 {
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
 ; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[EE_COND]], i1 true, i1 [[EXITCOND_NOT]]
-; CHECK-NEXT:    br i1 [[OR_COND]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label %[[EXIT]], label %[[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-branch-commits mailing list