[llvm] cb32b8b - [LoopUnrollPass] Don't pre-set `UP.Count` before legality checks in `computeUnrollCount()` (#185979)

Tue Mar 31 12:51:11 PDT 2026

Author: Justin Fargnoli
Date: 2026-03-31T19:50:52Z
New Revision: cb32b8bffbf32fd7936808ad260764c28e3197db

URL: https://github.com/llvm/llvm-project/commit/cb32b8bffbf32fd7936808ad260764c28e3197db
DIFF: https://github.com/llvm/llvm-project/commit/cb32b8bffbf32fd7936808ad260764c28e3197db.diff

LOG: [LoopUnrollPass] Don't pre-set `UP.Count` before legality checks in `computeUnrollCount()` (#185979)

We currently set `UP.Count` to `TripCount` and `MaxTripCount` prior to
full and upper bound unrolling, respectively. This was likely done to
ensure that calls to `UCE.getUnrolledLoopSize(UP)` use the appropriate
trip count. However, we can use `UCE.getUnrolledLoopSize(UP,
FullUnrollTripCount)` instead.

To prevent unintentional unrolling, we set `UP.Count = 0` when
early-exiting `computeUnrollCount()`. (Note: this does not occur
[here](https://github.com/llvm/llvm-project/blob/eb687fb10613573b5d60dfbbb146e15f07809d82/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L1190-L1198).
This seems like a bug.)

We only perform early exits when evaluating runtime unrolling. At that
point, [we know `TripCount` is
false](https://github.com/justinfargnoli/llvm-project/blob/3fb31e7b06f6f6d08c7310506a2f363d198a6790/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L1157-L1158),
and thus we could not have leaked `TripCount`. However, we [could've
leaked
`MaxTripCount`](https://github.com/llvm/llvm-project/blob/eb687fb10613573b5d60dfbbb146e15f07809d82/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L1102-L1110).

It seems like:
https://github.com/llvm/llvm-project/blob/eb687fb10613573b5d60dfbbb146e15f07809d82/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L1181-L1188

was supposed to handle this case. However:

- It uses `<` instead of `<=`. This breaks the existing convention
[[1]](https://github.com/llvm/llvm-project/blob/eb687fb10613573b5d60dfbbb146e15f07809d82/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L869)
[[2]](https://github.com/llvm/llvm-project/blob/eb687fb10613573b5d60dfbbb146e15f07809d82/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp#L1103)
for how `UP.MaxUpperBound` is treated.
- It's ignored when a target sets `UP.Force = true`.

Thus:
- When `UP.Force == false`, we leak `MaxTripCount` into runtime
unrolling when `MaxTripCount && (UP.UpperBound || MaxOrZero) &&
MaxTripCount == UP.MaxUpperBound`
- When `UP.Force == true`, we leak `MaxTripCount` into runtime unrolling
when `MaxTripCount && (UP.UpperBound || MaxOrZero) && MaxTripCount <=
UP.MaxUpperBound`.

This PR:
- Uses `UCE.getUnrolledLoopSize(UP, FullUnrollTripCount)`
- Stops setting `TripCount` and `MaxTripCount` prior to calling
`shouldFullUnroll()`
- Removes the `UP.Count = 0` safeguards
- Swaps `<` with `<=`, to address the `UP.Force == false` case
- Adds a test to document the behavior change (no longer leaking
`MaxTripCount`) in the `UP.Force == true` case.

Added: 
    llvm/test/Transforms/LoopUnroll/RISCV/runtime-unroll-max-trip-count.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
    llvm/test/Transforms/LoopUnroll/debug.ll
    llvm/test/Transforms/LoopUnroll/runtime-small-upperbound.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 9fdf7ef1b0a86..ce15427c43e4a 100644

--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -879,7 +879,7 @@ static std::optional<unsigned> shouldFullUnroll(
 
   // When computing the unrolled size, note that BEInsns are not replicated
   // like the rest of the loop body.
-  uint64_t UnrolledSize = UCE.getUnrolledLoopSize(UP);
+  uint64_t UnrolledSize = UCE.getUnrolledLoopSize(UP, FullUnrollTripCount);
   if (UnrolledSize < UP.Threshold) {
     LLVM_DEBUG(dbgs().indent(2) << "Unrolling: size " << UnrolledSize
                                 << " < threshold " << UP.Threshold << ".\n");
@@ -1054,9 +1054,8 @@ void llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
   // 3rd priority is exact full unrolling.  This will eliminate all copies
   // of some exit test.
   LLVM_DEBUG(dbgs().indent(1) << "Trying full unroll...\n");
-  UP.Count = 0;
+  assert(UP.Count == 0);
   if (TripCount) {
-    UP.Count = TripCount;
     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                              TripCount, UCE, UP)) {
       UP.Count = *UnrollFactor;
@@ -1079,7 +1078,6 @@ void llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
   LLVM_DEBUG(dbgs().indent(1) << "Trying upper-bound unroll...\n");
   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
       MaxTripCount <= UP.MaxUpperBound) {
-    UP.Count = MaxTripCount;
     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                              MaxTripCount, UCE, UP)) {
       UP.Count = *UnrollFactor;
@@ -1152,16 +1150,14 @@ void llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
   if (PInfo.PragmaRuntimeUnrollDisable) {
     LLVM_DEBUG(dbgs().indent(2)
                << "Not runtime unrolling: disabled by pragma.\n");
-    UP.Count = 0;
     return;
   }
 
   // Don't unroll a small upper bound loop unless user or TTI asked to do so.
-  if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
-    LLVM_DEBUG(dbgs().indent(2)
-               << "Not runtime unrolling: max trip count " << MaxTripCount
-               << " is small (< " << UP.MaxUpperBound << ") and not forced.\n");
-    UP.Count = 0;
+  if (MaxTripCount && !UP.Force && MaxTripCount <= UP.MaxUpperBound) {
+    LLVM_DEBUG(dbgs().indent(2) << "Not runtime unrolling: max trip count "
+                                << MaxTripCount << " is small (<= "
+                                << UP.MaxUpperBound << ") and not forced.\n");
     return;
   }
 
@@ -1174,17 +1170,18 @@ void llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
         UP.AllowExpensiveTripCount = true;
     }
   }
+
   UP.Runtime |= PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0 ||
                 PInfo.UserUnrollCount;
   if (!UP.Runtime) {
     LLVM_DEBUG(dbgs().indent(2)
                << "Will not try to unroll loop with runtime trip count "
                << "because -unroll-runtime not given\n");
-    UP.Count = 0;
     return;
   }
-  if (UP.Count == 0)
-    UP.Count = UP.DefaultUnrollRuntimeCount;
+
+  assert(UP.Count == 0);
+  UP.Count = UP.DefaultUnrollRuntimeCount;
 
   // Reduce unroll count to be the largest power-of-two factor of
   // the original count which satisfies the threshold limit.

diff  --git a/llvm/test/Transforms/LoopUnroll/RISCV/runtime-unroll-max-trip-count.ll b/llvm/test/Transforms/LoopUnroll/RISCV/runtime-unroll-max-trip-count.ll
new file mode 100644
index 0000000000000..8191a527dc222
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/RISCV/runtime-unroll-max-trip-count.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=loop-unroll -mattr=+no-default-unroll | FileCheck %s
+
+; Regression test: when a loop has a small MaxTripCount and falls through to
+; runtime unrolling, the unroll count should be based on DefaultUnrollRuntimeCount
+; (not MaxTripCount). Previously, a quirk would set UP.Count = MaxTripCount (5)
+; before the power-of-two reduction, yielding count 2 (5 >> 1). Now the count
+; starts at DefaultUnrollRuntimeCount (8) and reduces to 4 (8 >> 1).
+;
+; This test uses RISC-V because we need a target that sets UP.Force = true
+; (via RISCVTTIImpl::getUnrollingPreferences) to trigger the bug.
+
+target triple = "riscv64-unknown-linux-gnu"
+
+define void @test(i32 %start, ptr %p) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[END:%.*]] = add i32 [[START:%.*]], 5
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[START]], [[ENTRY:%.*]] ], [ [[IV_NEXT_3:%.*]], [[LOOP_3:%.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    store volatile i32 [[V]], ptr [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add nuw i32 [[IV]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IV_NEXT]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_1:%.*]], label [[EXIT:%.*]]
+; CHECK:       loop.1:
+; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 [[IV_NEXT]]
+; CHECK-NEXT:    [[V_1:%.*]] = load volatile i32, ptr [[GEP_1]], align 4
+; CHECK-NEXT:    store volatile i32 [[V_1]], ptr [[GEP_1]], align 4
+; CHECK-NEXT:    [[IV_NEXT_1:%.*]] = add nuw i32 [[IV]], 2
+; CHECK-NEXT:    [[CMP_1:%.*]] = icmp ult i32 [[IV_NEXT_1]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_1]], label [[LOOP_2:%.*]], label [[EXIT]]
+; CHECK:       loop.2:
+; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 [[IV_NEXT_1]]
+; CHECK-NEXT:    [[V_2:%.*]] = load volatile i32, ptr [[GEP_2]], align 4
+; CHECK-NEXT:    store volatile i32 [[V_2]], ptr [[GEP_2]], align 4
+; CHECK-NEXT:    [[IV_NEXT_2:%.*]] = add nuw i32 [[IV]], 3
+; CHECK-NEXT:    [[CMP_2:%.*]] = icmp ult i32 [[IV_NEXT_2]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_2]], label [[LOOP_3]], label [[EXIT]]
+; CHECK:       loop.3:
+; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 [[IV_NEXT_2]]
+; CHECK-NEXT:    [[V_3:%.*]] = load volatile i32, ptr [[GEP_3]], align 4
+; CHECK-NEXT:    store volatile i32 [[V_3]], ptr [[GEP_3]], align 4
+; CHECK-NEXT:    [[IV_NEXT_3]] = add nuw i32 [[IV]], 4
+; CHECK-NEXT:    [[CMP_3:%.*]] = icmp ult i32 [[IV_NEXT_3]], [[END]]
+; CHECK-NEXT:    br i1 [[CMP_3]], label [[LOOP]], label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %end = add i32 %start, 5
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop ]
+  %gep = getelementptr inbounds i32, ptr %p, i32 %iv
+  %v = load volatile i32, ptr %gep
+  %b0 = call i32 @llvm.bswap.i32(i32 %v)
+  %b1 = call i32 @llvm.bswap.i32(i32 %b0)
+  %b2 = call i32 @llvm.bswap.i32(i32 %b1)
+  %b3 = call i32 @llvm.bswap.i32(i32 %b2)
+  %b4 = call i32 @llvm.bswap.i32(i32 %b3)
+  %b5 = call i32 @llvm.bswap.i32(i32 %b4)
+  %b6 = call i32 @llvm.bswap.i32(i32 %b5)
+  %b7 = call i32 @llvm.bswap.i32(i32 %b6)
+  %b8 = call i32 @llvm.bswap.i32(i32 %b7)
+  %b9 = call i32 @llvm.bswap.i32(i32 %b8)
+  %b10 = call i32 @llvm.bswap.i32(i32 %b9)
+  %b11 = call i32 @llvm.bswap.i32(i32 %b10)
+  %b12 = call i32 @llvm.bswap.i32(i32 %b11)
+  %b13 = call i32 @llvm.bswap.i32(i32 %b12)
+  %b14 = call i32 @llvm.bswap.i32(i32 %b13)
+  %b15 = call i32 @llvm.bswap.i32(i32 %b14)
+  %b16 = call i32 @llvm.bswap.i32(i32 %b15)
+  %b17 = call i32 @llvm.bswap.i32(i32 %b16)
+  %b18 = call i32 @llvm.bswap.i32(i32 %b17)
+  %b19 = call i32 @llvm.bswap.i32(i32 %b18)
+  %b20 = call i32 @llvm.bswap.i32(i32 %b19)
+  %b21 = call i32 @llvm.bswap.i32(i32 %b20)
+  %b22 = call i32 @llvm.bswap.i32(i32 %b21)
+  %b23 = call i32 @llvm.bswap.i32(i32 %b22)
+  %b24 = call i32 @llvm.bswap.i32(i32 %b23)
+  %b25 = call i32 @llvm.bswap.i32(i32 %b24)
+  store volatile i32 %b25, ptr %gep
+  %iv.next = add nuw i32 %iv, 1
+  %cmp = icmp ult i32 %iv.next, %end
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret void
+}

diff  --git a/llvm/test/Transforms/LoopUnroll/debug.ll b/llvm/test/Transforms/LoopUnroll/debug.ll
index 6b611952c0935..fd84eeeb3d457 100644
--- a/llvm/test/Transforms/LoopUnroll/debug.ll
+++ b/llvm/test/Transforms/LoopUnroll/debug.ll
@@ -461,7 +461,7 @@ exit:
 ; CHECK-NEXT: Trying loop peeling...
 ; CHECK-NEXT: Trying partial unroll...
 ; CHECK-NEXT: Trying runtime unroll...
-; CHECK-NEXT:  Not runtime unrolling: max trip count 3 is small (< 8) and not forced.
+; CHECK-NEXT:  Not runtime unrolling: max trip count 3 is small (<= 8) and not forced.
 ; CHECK-NEXT: Not unrolling: no viable strategy found.
 
 define i32 @runtime_small_max_tc(ptr %A, i32 %n) {

diff  --git a/llvm/test/Transforms/LoopUnroll/runtime-small-upperbound.ll b/llvm/test/Transforms/LoopUnroll/runtime-small-upperbound.ll
index 247cdd6c6be28..c56d4f267124c 100644
--- a/llvm/test/Transforms/LoopUnroll/runtime-small-upperbound.ll
+++ b/llvm/test/Transforms/LoopUnroll/runtime-small-upperbound.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -passes=loop-unroll -unroll-runtime %s -o - | FileCheck %s
-; RUN: opt -S -passes=loop-unroll -unroll-runtime -unroll-max-upperbound=6 %s -o - | FileCheck %s --check-prefix=UPPER
+; RUN: opt -S -passes=loop-unroll -unroll-runtime -unroll-max-upperbound=5 %s -o - | FileCheck %s --check-prefix=UPPER
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
@@ -70,7 +70,7 @@ exit:
   ret void
 }
 
-; Check that loop in hoge_5, with a runtime upperbound of 5, is unrolled when -unroll-max-upperbound=4
+; Check that loop in hoge_5, with a runtime upperbound of 5, is unrolled when -unroll-max-upperbound=5
 define dso_local void @hoge_5(i8 %arg) {
 ; CHECK-LABEL: @hoge_5(
 ; CHECK-NEXT:  entry: