[llvm] [IndVarSimplify] Relax Restrictions on Loop Counter Stride (PR #146992)

Thu Jul 3 20:35:42 PDT 2025

https://github.com/buggfg created https://github.com/llvm/llvm-project/pull/146992



This patch relaxes the restrictions on the stride of loop counters, providing optimization opportunities for common countdown loop structures.

This PR was separated out from [#146845](https://github.com/llvm/llvm-project/pull/146845).

**Key changes**:

Usually the **loop's iterations are counted** by an integer-valued variable that proceeds **upward (or downward) by a constant** amount with each iteration[1][2].  However, the current design requires the loop counter to have a unit stride (+1) and does not support -1, which limits optimization potential.

Without disrupting the original design, we have relaxed the stride restrictions in both `isLoopCounter()` and `genLoopLimit()` to allow loop counters with strides of **either +1 or -1**. This enhancement, combined with LLVM’s existing infrastructure, enables support for common countdown loop patterns—for example, transforming a loop such as `for (i = 31; i * i > 48; i--)` into `for (i = 31; i > 6; i--)`—thereby improving the pass’s versatility and optimization coverage.



[1] Steven S. Muchnick. 1998. Advanced compiler design and implementation. Morgan Kaufmann Publishers Inc., San Francisco, CA, USA.

[2] [Loops - Using and Porting GNU Fortran](https://gcc.gnu.org/onlinedocs/gcc-3.3.6/g77/Loops.html)

**Tests**

- A LIT test  added to confirm the effectiveness of the patch.

- Four existing LIT tests  modified because relaxing the stride restrictions on LoopCounters provides more optimization opportunities for LFTR, such as simplifying exit comparisons from `sgt/slt` to `eq/ne`.

- We further verified the correctness of the transformation and evaluated its performance using the SPEC CPU2006 and SPEC CPU2017 benchmark suites. 

- The results of the SPEC CPU2017 benchmark are as follows:

  | bench           | ratio<br>[llvm base] | ratio<br>[with the relaxed LoopCounter] |  SpeedUp  |
  | --------------- | :------------------: | :-------------------------------------: | :-------: |
  | 623.xalancbmk_s |        12.86         |                  13.01                  | **1.01x** |
  | 648.exchange2_s |        17.38         |                  18.87                  | **1.09x** |
  | 619.lbm_s       |         7.95         |                  8.08                   | **1.02x** |

  * \[with the relaxed LoopCounter]: Relax the stride restriction on the LoopCounter.
  * Platform: X86-intel (I9-11900K, L1-cache 384KB, L2-cache 4MB, L3-cache 16MB, cache line 64B).
  *  

**Authors**

  The [XSCC compiler team](https://github.com/orgs/OpenXiangShan/teams/xscc) developed this implementation.

>From e88d3c1e0e892c15ff210bbd4b99d7dd9da005e6 Mon Sep 17 00:00:00 2001
From: bernadate <3171290993 at qq.com>
Date: Fri, 4 Jul 2025 11:19:09 +0800
Subject: [PATCH] [IndVarSimplify] Relax Restrictions on Loop Counter Stride

Co-Authored-By: ict-ql <168183727+ict-ql at users.noreply.github.com>
Co-Authored-By: Chyaka <52224511+liliumshade at users.noreply.github.com>
Co-Authored-By: Lin Wang <wanglulin at ict.ac.cn>
---
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 13 +++++--
 .../Transforms/IndVarSimplify/X86/pr24356.ll  |  8 ++--
 .../check-loop-counter-stride.ll              | 39 +++++++++++++++++++
 .../Transforms/IndVarSimplify/drop-exact.ll   |  8 +---
 .../IndVarSimplify/eliminate-comparison.ll    |  7 ++--
 llvm/test/Transforms/IndVarSimplify/lftr.ll   |  2 +-
 6 files changed, 60 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/check-loop-counter-stride.ll

diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 334c911191cb8..404be19a0a5f1 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -798,7 +798,7 @@ static bool hasConcreteDef(Value *V) {
 
 /// Return true if the given phi is a "counter" in L.  A counter is an
 /// add recurance (of integer or pointer type) with an arbitrary start, and a
-/// step of 1.  Note that L must have exactly one latch.
+/// step of 1/-1.  Note that L must have exactly one latch.
 static bool isLoopCounter(PHINode* Phi, Loop *L,
                           ScalarEvolution *SE) {
   assert(Phi->getParent() == L->getHeader());
@@ -808,7 +808,13 @@ static bool isLoopCounter(PHINode* Phi, Loop *L,
     return false;
 
   const SCEV *S = SE->getSCEV(Phi);
-  if (!match(S, m_scev_AffineAddRec(m_SCEV(), m_scev_One(), m_SpecificLoop(L))))
+  const SCEVConstant *Step;
+  if (!match(S, m_scev_AffineAddRec(m_SCEV(), m_SCEVConstant(Step),
+                                    m_SpecificLoop(L))))
+    return false;
+  int64_t StepVal = Step->getValue()->getSExtValue();
+  // Require that the loop counter stride can only be 1 or -1
+  if (StepVal != 1 && StepVal != -1)
     return false;
 
   int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
@@ -910,7 +916,8 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
   assert(isLoopCounter(IndVar, L, SE));
   assert(ExitCount->getType()->isIntegerTy() && "exit count must be integer");
   const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
-  assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+  const SCEV *StepAbs = SE->getAbsExpr(AR->getStepRecurrence(*SE), true);
+  assert(StepAbs->isOne() && "only handles unit stride");
 
   // For integer IVs, truncate the IV before computing the limit unless we
   // know apriori that the limit must be a constant when evaluated in the
diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr24356.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr24356.ll
index f2d938f6452d3..05575c58bff6d 100644
--- a/llvm/test/Transforms/IndVarSimplify/X86/pr24356.ll
+++ b/llvm/test/Transforms/IndVarSimplify/X86/pr24356.ll
@@ -14,11 +14,11 @@ bb:
 bb4.preheader:                                    ; preds = %bb, %bb16
 ; CHECK-LABEL:  bb4.preheader:
   %b.03 = phi i8 [ 0, %bb ], [ %tmp17, %bb16 ]
-; CHECK: %tmp9 = icmp ugt i8 %b.03, 1
-; CHECK-NOT: %tmp9 = icmp ugt i8 0, 1
+; CHECK: %exitcond = icmp eq i8 %b.03, -1
+; CHECK-NOT: %exitcond = icmp ugt i8 0, 1
 
-  %tmp9 = icmp ugt i8 %b.03, 1
-  br i1 %tmp9, label %bb4.preheader.bb18.loopexit.split_crit_edge, label %bb4.preheader.bb4.preheader.split_crit_edge
+  %exitcond = icmp eq i8 %b.03, -1
+  br i1 %exitcond, label %bb4.preheader.bb18.loopexit.split_crit_edge, label %bb4.preheader.bb4.preheader.split_crit_edge
 
 bb4.preheader.bb4.preheader.split_crit_edge:      ; preds = %bb4.preheader
   br label %bb4.preheader.split
diff --git a/llvm/test/Transforms/IndVarSimplify/check-loop-counter-stride.ll b/llvm/test/Transforms/IndVarSimplify/check-loop-counter-stride.ll
new file mode 100644
index 0000000000000..1c81ae3fc08e8
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/check-loop-counter-stride.ll
@@ -0,0 +1,39 @@
+; Test the case where the LoopCounter's stride equals -1.
+; RUN: opt -S -passes=indvars  < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+
+define void @check_step_minus_one(ptr nocapture readonly %0)  {
+; CHECK-LABEL: define void @check_step_minus_one(ptr readonly captures(none) %0) {
+; CHECK:       entry:
+; CHECK-NEXT:  br label [[loop:.*]]
+; CHECK:       loop:
+; CHECK-NEXT:  [[IV:%.*]] = phi i64 [ 31, [[entry:%.*]] ], [ [[PostDec:%.*]], [[loop:%.*]] ]
+; CHECK-NEXT:  [[GEP:%.*]] = getelementptr inbounds i32, ptr %0, i64 [[IV]]
+; CHECK-NEXT:  [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:  [[ADD:%.*]] = add nsw i32 [[LOAD]], 1
+; CHECK-NEXT:  store i32 [[ADD]], ptr [[GEP]], align 4
+; CHECK-NEXT:  [[PostDec:%.*]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT:  [[CMP:%.*]] = icmp ne i64 [[PostDec]], 6
+; CHECK-NEXT:  br i1 [[CMP]], label [[loop:.*]], label [[end:.*]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:                  
+  br label %loop
+
+loop:                                           
+  %1 = phi i64 [ 31, %entry ], [ %6, %loop ]
+  %3 = getelementptr inbounds i32, ptr %0, i64 %1
+  %4 = load i32, ptr %3, align 4
+  %5 = add nsw i32 %4, 1
+  store i32 %5, ptr %3, align 4
+  %6 = add nsw i64 %1, -1
+  %7 = mul nsw i64 %6, %6
+  %8 = icmp samesign ugt i64 %7, 48
+  br i1 %8, label %loop, label %end
+
+end:                                      
+  ret void
+}
+
diff --git a/llvm/test/Transforms/IndVarSimplify/drop-exact.ll b/llvm/test/Transforms/IndVarSimplify/drop-exact.ll
index fb8027df74ee7..2a60c9d73a021 100644
--- a/llvm/test/Transforms/IndVarSimplify/drop-exact.ll
+++ b/llvm/test/Transforms/IndVarSimplify/drop-exact.ll
@@ -13,7 +13,6 @@ define void @drop_exact(ptr %p, ptr %p1) {
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb12:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
 ; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
 ; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
@@ -23,8 +22,7 @@ define void @drop_exact(ptr %p, ptr %p1) {
 ; CHECK-NEXT:    store i32 [[TMP22]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
 ; CHECK-NEXT:    store i64 [[TMP26]], ptr [[P1:%.*]], align 4
-; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP15]], -48155
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
 ;
 bb:
@@ -60,7 +58,6 @@ define void @dont_drop_exact(ptr %p, ptr %p1) {
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb12:
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi i32 [ -47436, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB12]] ]
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMP42:%.*]], [[BB12]] ]
 ; CHECK-NEXT:    [[TMP15]] = add nsw i32 [[TMP13]], -1
 ; CHECK-NEXT:    [[TMP16:%.*]] = shl i32 [[TMP15]], 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = sub nsw i32 42831, [[TMP16]]
@@ -70,8 +67,7 @@ define void @dont_drop_exact(ptr %p, ptr %p1) {
 ; CHECK-NEXT:    store i32 [[TMP22]], ptr [[P:%.*]], align 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext i32 [[TMP20]] to i64
 ; CHECK-NEXT:    store i64 [[TMP26]], ptr [[P1:%.*]], align 4
-; CHECK-NEXT:    [[TMP42]] = add nuw nsw i32 [[TMP14]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP42]], 719
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP15]], -48155
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB7:%.*]], label [[BB12]]
 ;
 bb:
diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
index 08f9856ac603d..7f889cc6f6d0b 100644
--- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -68,6 +68,7 @@ return:
 define i32 @_ZNK4llvm5APInt3ultERKS0_(i32 %tmp2.i1, ptr %tmp65, ptr %tmp73, ptr %tmp82, ptr %tmp90) {
 ; CHECK-LABEL: @_ZNK4llvm5APInt3ultERKS0_(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SMIN:%.*]]= call i32 @llvm.smin.i32(i32 %tmp2.i1, i32 -1)
 ; CHECK-NEXT:    br label [[BB18:%.*]]
 ; CHECK:       bb13:
 ; CHECK-NEXT:    [[TMP66:%.*]] = load ptr, ptr [[TMP65:%.*]], align 4
@@ -88,11 +89,11 @@ define i32 @_ZNK4llvm5APInt3ultERKS0_(i32 %tmp2.i1, ptr %tmp65, ptr %tmp73, ptr
 ; CHECK-NEXT:    [[TMP95:%.*]] = icmp ult i64 [[TMP86]], [[TMP94]]
 ; CHECK-NEXT:    br i1 [[TMP95]], label [[BB20_LOOPEXIT]], label [[BB17:%.*]]
 ; CHECK:       bb17:
-; CHECK-NEXT:    [[TMP97:%.*]] = add nsw i32 [[I]], -1
+; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[I]], -1
 ; CHECK-NEXT:    br label [[BB18]]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    [[I]] = phi i32 [ [[TMP2_I1:%.*]], [[ENTRY:%.*]] ], [ [[TMP97]], [[BB17]] ]
-; CHECK-NEXT:    [[TMP99:%.*]] = icmp sgt i32 [[I]], -1
+; CHECK-NEXT:    [[TMP99:%.*]] = icmp ne i32 [[I]], [[SMIN:%.*]]
 ; CHECK-NEXT:    br i1 [[TMP99]], label [[BB13:%.*]], label [[BB20_LOOPEXIT]]
 ; CHECK:       bb20.loopexit:
 ; CHECK-NEXT:    [[TMP_0_PH:%.*]] = phi i32 [ 0, [[BB18]] ], [ 1, [[BB15]] ], [ 0, [[BB13]] ]
@@ -917,7 +918,7 @@ define void @func_24(ptr %init.ptr) {
 ; CHECK-NEXT:    br i1 true, label [[BE]], label [[LEAVE_LOOPEXIT:%.*]]
 ; CHECK:       be:
 ; CHECK-NEXT:    call void @side_effect()
-; CHECK-NEXT:    [[BE_COND:%.*]] = icmp sgt i32 [[IV_DEC]], 4
+; CHECK-NEXT:    [[BE_COND:%.*]] = icmp ne i32 [[IV_DEC]], 4
 ; CHECK-NEXT:    br i1 [[BE_COND]], label [[LOOP]], label [[LEAVE_LOOPEXIT]]
 ; CHECK:       leave.loopexit:
 ; CHECK-NEXT:    br label [[LEAVE]]
diff --git a/llvm/test/Transforms/IndVarSimplify/lftr.ll b/llvm/test/Transforms/IndVarSimplify/lftr.ll
index 5ee62ba357ab6..f321daa00a953 100644
--- a/llvm/test/Transforms/IndVarSimplify/lftr.ll
+++ b/llvm/test/Transforms/IndVarSimplify/lftr.ll
@@ -43,7 +43,7 @@ define i32 @pre_to_post_sub() {
 ; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 1000, [[ENTRY:%.*]] ], [ [[I_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[I_NEXT]] = sub nsw i32 [[I]], 1
 ; CHECK-NEXT:    store i32 [[I]], ptr @A, align 4
-; CHECK-NEXT:    [[C:%.*]] = icmp samesign ugt i32 [[I]], 0
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[I_NEXT]], -1
 ; CHECK-NEXT:    br i1 [[C]], label [[LOOP]], label [[LOOPEXIT:%.*]]
 ; CHECK:       loopexit:
 ; CHECK-NEXT:    ret i32 0