[llvm] [LSR] Make OptimizeLoopTermCond able to handle some non-cmp conditions (PR #165590)

John Brawn via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 29 09:16:20 PDT 2025


https://github.com/john-brawn-arm created https://github.com/llvm/llvm-project/pull/165590

Currently OptimizeLoopTermCond can only convert a cmp instruction to use a postincrement induction variable, which means it can't handle predicated loops where the termination condition comes from get_active_lane_mask. Relax this restriction so that we can handle any kind of instruction, though only if it's the instruction immediately before the branch (or immediately before an extractelement that feeds the branch).
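
For reference, this is the shape of termination condition the change is aimed at, condensed from the lane_mask test added in this patch (value and block names abbreviated):

  vector.body:
    %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
    %index.next = add i64 %index, %vscalex4
    %mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %n)
    %cond = extractelement <vscale x 4 x i1> %mask.next, i64 0
    br i1 %cond, label %vector.body, label %exit

The i1 fed to the branch comes from an extractelement of a get.active.lane.mask call rather than from an icmp, so previously OptimizeLoopTermCond gave up. With this change the IV use in the get.active.lane.mask call can be marked as a post-increment use of %index, which is what the lane_mask test below checks for.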

From 139a2869a53546159f7a60b49f91a9b1bf33ca81 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Fri, 24 Oct 2025 16:02:37 +0100
Subject: [PATCH] [LSR] Make OptimizeLoopTermCond able to handle some non-cmp
 conditions

Currently OptimizeLoopTermCond can only convert a cmp instruction to
use a postincrement induction variable, which means it can't handle
predicated loops where the termination condition comes from
get_active_lane_mask. Relax this restriction so that we can handle any
kind of instruction, though only if it's the instruction immediately
before the branch (or immediately before an extractelement that feeds
the branch).
---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |  44 +++-
 .../AArch64/non-cmp-cond.ll                   | 209 ++++++++++++++++++
 2 files changed, 241 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 1a279b6198182..6a4f4cbcc36bf 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -2181,8 +2181,8 @@ class LSRInstance {
   SmallSetVector<Instruction *, 4> InsertedNonLCSSAInsts;
 
   void OptimizeShadowIV();
-  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
-  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+  bool FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse);
+  Instruction *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
   void OptimizeLoopTermCond();
 
   void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
@@ -2416,7 +2416,7 @@ void LSRInstance::OptimizeShadowIV() {
 
 /// If Cond has an operand that is an expression of an IV, set the IV user and
 /// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+bool LSRInstance::FindIVUserForCond(Instruction *Cond, IVStrideUse *&CondUse) {
   for (IVStrideUse &U : IU)
     if (U.getUser() == Cond) {
       // NOTE: we could handle setcc instructions with multiple uses here, but
@@ -2476,7 +2476,7 @@ bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
 /// This function solves this problem by detecting this type of loop and
 /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
 /// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+Instruction *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
   // Check that the loop matches the pattern we're looking for.
   if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
       Cond->getPredicate() != CmpInst::ICMP_NE)
@@ -2620,15 +2620,34 @@ LSRInstance::OptimizeLoopTermCond() {
     // one register value.
 
     BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
-    if (!TermBr)
+    if (!TermBr || TermBr->isUnconditional())
       continue;
-    // FIXME: Overly conservative, termination condition could be an 'or' etc..
-    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+
+    Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
+    bool CondImmediatelyBeforeTerm = Cond && Cond->getNextNode() == TermBr;
+    // If the argument to TermBr is an extractelement, then the source of that
+    // instruction is what generated the condition.
+    auto *Extract = dyn_cast_or_null<ExtractElementInst>(Cond);
+    if (Extract) {
+      Cond = dyn_cast<Instruction>(Extract->getVectorOperand());
+      if (Cond && CondImmediatelyBeforeTerm)
+        CondImmediatelyBeforeTerm = Cond->getNextNode() == Extract;
+    }
+    // FIXME: We could do more here, like handling logical operations where one
+    // side is a cmp that uses an induction variable.
+    if (!Cond)
+      continue;
+
+    // If the condition instruction isn't immediately before TermBr then it has
+    // to either be a CmpInst, or be immediately before an extract that's
+    // immediately before TermBr, as currently we can only move or clone a
+    // CmpInst.
+    // FIXME: We should be able to do this when it's safe to do so.
+    if ((!isa<CmpInst>(Cond) || Extract) && !CondImmediatelyBeforeTerm)
       continue;
 
     // Search IVUsesByStride to find Cond's IVUse if there is one.
     IVStrideUse *CondUse = nullptr;
-    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
     if (!FindIVUserForCond(Cond, CondUse))
       continue;
 
@@ -2638,7 +2657,8 @@ LSRInstance::OptimizeLoopTermCond() {
     // One consequence of doing this now is that it disrupts the count-down
     // optimization. That's not always a bad thing though, because in such
     // cases it may still be worthwhile to avoid a max.
-    Cond = OptimizeMax(Cond, CondUse);
+    if (auto *Cmp = dyn_cast<ICmpInst>(Cond))
+      Cond = OptimizeMax(Cmp, CondUse);
 
     // If this exiting block dominates the latch block, it may also use
     // the post-inc value if it won't be shared with other uses.
@@ -2703,13 +2723,13 @@ LSRInstance::OptimizeLoopTermCond() {
     // It's possible for the setcc instruction to be anywhere in the loop, and
     // possible for it to have multiple users.  If it is not immediately before
     // the exiting block branch, move it.
-    if (Cond->getNextNode() != TermBr) {
+    if (!CondImmediatelyBeforeTerm) {
       if (Cond->hasOneUse()) {
         Cond->moveBefore(TermBr->getIterator());
       } else {
         // Clone the terminating condition and insert into the loopend.
-        ICmpInst *OldCond = Cond;
-        Cond = cast<ICmpInst>(Cond->clone());
+        Instruction *OldCond = Cond;
+        Cond = Cond->clone();
         Cond->setName(L->getHeader()->getName() + ".termcond");
         Cond->insertInto(ExitingBlock, TermBr->getIterator());
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll
new file mode 100644
index 0000000000000..7e184ec9cebbd
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/non-cmp-cond.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -loop-reduce %s -S -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Tests where the loop termination condition is not generated by a compare.
+
+; The call to get.active.lane.mask in the loop should use the postincrement
+; value of %index.
+define void @lane_mask(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP1]] = add i64 [[INDEX]], [[VSCALEX4]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP1]], i64 [[N]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %vector.body ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %n)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; The store between the call and the branch should cause get.active.lane.mask to
+; use a preincrement value.
+; FIXME: We could use a postincrement value by moving the call and
+; extractelement to after the store.
+define void @lane_mask_not_last(ptr %dst, i64 %n) #0 {
+; CHECK-LABEL: define void @lane_mask_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VSCALEX4:%.*]] = shl i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT1:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT1]] = add i64 [[INDEX]], [[VSCALEX4]]
+; CHECK-NEXT:    [[INDEX_NEXT:%.*]] = add i64 [[VSCALEX4]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr align 4 [[SCEVGEP]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vscalex4 = shl i64 %vscale, 2
+  %active.lane.mask.entry = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %n)
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %vector.body ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %index
+  %index.next = add i64 %index, %vscalex4
+  %active.lane.mask.next = tail call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 %index.next, i64 %n)
+  tail call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> splat (i32 1), ptr %gep, i32 4, <vscale x 4 x i1> %active.lane.mask)
+  %cond = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
+  br i1 %cond, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; The call to cmp_fn in the loop should use the postincrement value of %index.
+define void @uses_cmp_fn(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    store i32 0, ptr [[SCEVGEP]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[INDEX_NEXT]])
+; CHECK-NEXT:    br i1 [[COND]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %index
+  store i32 0, ptr %gep, align 4
+  %index.next = add i64 %index, 1
+  %cond = tail call i1 @cmp_fn(i64 %index.next)
+  br i1 %cond, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; The store between the call and the branch should cause cmp_fn to use a
+; preincrement value. We can't move the call after the store as the call could
+; have side effects.
+define void @uses_cmp_fn_not_last(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @uses_cmp_fn_not_last(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[VECTOR_BODY]] ], [ 1, %[[ENTRY]] ]
+; CHECK-NEXT:    [[COND:%.*]] = tail call i1 @cmp_fn(i64 [[LSR_IV]])
+; CHECK-NEXT:    store i32 0, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %index
+  %index.next = add i64 %index, 1
+  %cond = tail call i1 @cmp_fn(i64 %index.next)
+  store i32 0, ptr %gep, align 4
+  br i1 %cond, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+; cmp2 will use a preincrement induction variable as it isn't directly the loop
+; termination condition.
+; FIXME: We could potentially handle this by examining the operands of the 'and'
+; instruction.
+define void @cmp_and(ptr %dst, i64 %n) {
+; CHECK-LABEL: define void @cmp_and(
+; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[VECTOR_BODY]] ], [ [[DST]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[LSR_IV_NEXT:%.*]] = phi i64 [ [[LSR_IV_NEXT1:%.*]], %[[VECTOR_BODY]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[VAL:%.*]] = load i64, ptr [[LSR_IV1]], align 8
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ne i64 [[VAL]], [[N]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    [[COND:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[LSR_IV_NEXT1]] = add i64 [[LSR_IV_NEXT]], -1
+; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT:    br i1 [[COND]], label %[[VECTOR_BODY]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %gep = getelementptr inbounds nuw i32, ptr %dst, i64 %index
+  %val = load i64, ptr %gep, align 8
+  %index.next = add i64 %index, 1
+  %cmp1 = icmp ne i64 %val, %n
+  %cmp2 = icmp ne i64 %index.next, %n
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+}
+
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64, i64)
+declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>)
+declare i1 @cmp_fn(i64)
+
+attributes #0 = { "target-features"="+sve2" }


