[llvm] [AMDGPU][TTI] Threshold bonus to loops whose unrolling makes nested loops unrollable (PR #114579)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 4 02:54:27 PST 2024
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/114579
From f56c4321413f3fd567b4044f1cc9521da65e6797 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Fri, 1 Nov 2024 18:03:14 +0100
Subject: [PATCH 1/2] Give cond. loop threshold bonus to outer loop in loop
nests
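
The bonus targets loop nests in which the inner loop's trip count depends only
on constants and the outer loop's induction variable, so that fully unrolling
the outer loop leaves every copy of the inner loop with a compile-time-constant
trip count that can in turn be fully unrolled. As a rough C++ sketch of the
shape involved (the bounds are illustrative and mirror the IR test added
below):

  // Inner trip count (8 - i) depends on the outer IV i, so it is not a
  // compile-time constant while the outer loop is still rolled.
  for (int i = 0; i < 8; ++i)
    for (int j = i; j < 8; ++j)
      mem[i * 16 + j] = 0;
  // Once the outer loop is fully unrolled, each inner loop copy runs a
  // constant number of times (8, 7, ..., 1) and becomes a full-unroll
  // candidate itself.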
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 ++++++++++++++-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 84 +++++++++++++++++++
2 files changed, 151 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 5160851f8c4424..79250ad1f83064 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(200), cl::Hidden);
+static cl::opt<unsigned> UnrollThresholdNestedStatic(
+ "amdgpu-unroll-threshold-nested-static",
+ cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
+ "trip count will be made runtime-independent when fully-unrolling "
+ "the outer loop"),
+ cl::init(200), cl::Hidden);
+
static cl::opt<bool> UnrollRuntimeLocal(
"amdgpu-unroll-runtime-local",
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
}
}
-
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+
+ if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ // Look for subloops whose trip count would go from runtime-dependent to
+ // runtime-independent if we were to unroll the loop. Give a bonus to the
+ // current loop's unrolling threshold for each of these, as fully unrolling
+ // it would likely expose additional optimization opportunities.
+ for (const Loop *SubLoop : L->getSubLoops()) {
+ std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+ if (!Bounds)
+ continue;
+ Value *InitIV = &Bounds->getInitialIVValue();
+ Value *FinalIV = &Bounds->getFinalIVValue();
+ Value *StepVal = Bounds->getStepValue();
+ if (!StepVal)
+ continue;
+
+ // Determines whether SubIV's derivation depends exclusively on constants
+ // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
+ // involved in the derivation.
+ bool SubIVDependsOnIV = false;
+ std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
+ [&](const Value *SubIV, unsigned Depth) -> bool {
+ if (SubIV == IV) {
+ SubIVDependsOnIV = true;
+ return true;
+ }
+ if (isa<Constant>(SubIV))
+ return true;
+ if (Depth >= 10)
+ return false;
+
+ const Instruction *I = dyn_cast<Instruction>(SubIV);
+ // No point in checking outside the loop since IV is necessarily inside
+ // it; also stop searching when encountering an instruction that will
+ // likely not allow SubIV's value to be statically computed.
+ if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
+ return false;
+
+ // SubIV depends on constants or IV if all of the instruction's
+ // operands involved in its derivation also depend on constants or IV.
+ return llvm::all_of(I->operand_values(), [&](const Value *V) {
+ return FromConstsOrLoopIV(V, Depth + 1);
+ });
+ };
+
+ if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
+ FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
+ UP.Threshold += UnrollThresholdNestedStatic;
+ LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+ << " for loop:\n"
+ << *L
+ << " due to subloop's trip count becoming "
+ "runtime-independent after unrolling:\n "
+ << *SubLoop);
+ if (UP.Threshold >= MaxBoost)
+ return;
+ }
+ }
+ }
+
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getDataLayout();
unsigned LocalGEPsSeen = 0;
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
new file mode 100644
index 00000000000000..36101c50db98ac
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -0,0 +1,84 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+
+; For @dependent_sub_fullunroll, the threshold bonus should apply
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
+; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
+; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+
+; Check that the outer loop of a doubly nested loop, where the inner loop's
+; trip count depends exclusively on constants and the outer IV, is fully
+; unrolled thanks to receiving a threshold bonus in AMDGPU's TTI.
+
+; CHECK-LABEL: @dependent_sub_fullunroll
+; CHECK: inner.header_latch_exiting.7
+; CHECK: outer.latch_exiting.7
+
+define void @dependent_sub_fullunroll(ptr noundef %mem) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
+; is not fully unrolled when the inner loop's final IV value depends on a
+; function argument instead of a combination of the outer IV and constants.
+
+; CHECK-LABEL: @dependent_sub_no_fullunroll
+; CHECK-NOT: inner.header_latch_exiting.7
+; CHECK-NOT: outer.latch_exiting.7
+
+define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 8
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
From 0a72dca4142ab896b39bd89f3c5fdab4e1ed6bd8 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <Lucas.Ramirez at amd.com>
Date: Mon, 4 Nov 2024 11:53:43 +0100
Subject: [PATCH 2/2] Address reviewers' comments
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +-
.../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 183 ++++++++++++++++--
2 files changed, 167 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 79250ad1f83064..8d6eb94af4a108 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -157,7 +157,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
- if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+ if (PHINode *IV = L->getInductionVariable(SE)) {
// Look for subloops whose trip count would go from runtime-dependent to
// runtime-independent if we were to unroll the loop. Give a bonus to the
// current loop's unrolling threshold for each of these, as fully unrolling
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
index 36101c50db98ac..97de4cbf0936c6 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll
@@ -1,4 +1,6 @@
-; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s
; For @dependent_sub_fullunroll, the threshold bonus should apply
; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
@@ -6,15 +8,63 @@
; For @dependent_sub_no_fullunroll, the threshold bonus should not apply
; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
; Check that the outer loop of a doubly nested loop, where the inner loop's
; trip count depends exclusively on constants and the outer IV, is fully
; unrolled thanks to receiving a threshold bonus in AMDGPU's TTI.
-; CHECK-LABEL: @dependent_sub_fullunroll
-; CHECK: inner.header_latch_exiting.7
-; CHECK: outer.latch_exiting.7
-
define void @dependent_sub_fullunroll(ptr noundef %mem) {
+; CHECK-LABEL: @dependent_sub_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]]
+; CHECK: inner.header_latch_exiting.2:
+; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], [[INNER_HEADER_LATCH_EXITING_2]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64
+; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]]
+; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4
+; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.2:
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]]
+; CHECK: inner.header_latch_exiting.3:
+; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1
+; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64
+; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]]
+; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4
+; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8
+; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.3:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -26,9 +76,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, 8
@@ -36,9 +86,9 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
end: ; preds = %outer.latch_exiting
ret void
}
@@ -47,11 +97,45 @@ end: ; preds = %outer.l
; is not fully unrolled when the inner loop's final IV value depends on a
; function argument instead of a combination of the outer IV and constants.
-; CHECK-LABEL: @dependent_sub_no_fullunroll
-; CHECK-NOT: inner.header_latch_exiting.7
-; CHECK-NOT: outer.latch_exiting.7
-
define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @dependent_sub_no_fullunroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ]
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK: inner.header_latch_exiting.1:
+; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64
+; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16
+; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]]
+; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting.1:
+; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2
+; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4
+; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
entry:
br label %outer.header
@@ -63,9 +147,9 @@ inner.header_latch_exiting: ; preds = %outer.h
%inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
%inner.iv_next = add nuw nsw i32 %inner.iv, 1
%outer.iv.ext = zext nneg i32 %outer.iv to i64
- %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
%inner.iv.ext = zext nneg i32 %inner.iv to i64
- %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
%addr = getelementptr inbounds i8, ptr %mem, i64 %idx
store i32 0, ptr %addr
%inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
@@ -73,9 +157,74 @@ inner.header_latch_exiting: ; preds = %outer.h
outer.latch_exiting: ; preds = %inner.header_latch_exiting
%outer.iv_next = add nuw nsw i32 %outer.iv, 1
- %outer.cond = icmp ult i32 %outer.iv_next, 8
+ %outer.cond = icmp ult i32 %outer.iv_next, 4
br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
-
+
+end: ; preds = %outer.latch_exiting
+ ret void
+}
+
+; Make sure that the threshold bonus does not override the correctness check
+; and cause unrolling when a convergent operation that is illegal to unroll
+; is present. The loop nest is the same as before except that the outer
+; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled
+; convergent call in the outer loop's header. Were the call non-convergent,
+; the outer loop would be partially unrolled by a factor of 2, with a
+; breakout of 1.
+
+declare void @convergent_operation() convergent
+
+define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) {
+; CHECK-LABEL: @dont_unroll_illegal_convergent_op(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[OUTER_HEADER:%.*]]
+; CHECK: outer.header:
+; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
+; CHECK-NEXT: call void @convergent_operation()
+; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK: inner.header_latch_exiting:
+; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8
+; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]]
+; CHECK: outer.latch_exiting:
+; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11
+; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK: end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %outer.header
+
+outer.header: ; preds = %entry, %outer.latch_exiting
+ %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+ call void @convergent_operation()
+ br label %inner.header_latch_exiting
+
+inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting
+ %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+ %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+ %outer.iv.ext = zext nneg i32 %outer.iv to i64
+ %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+ %inner.iv.ext = zext nneg i32 %inner.iv to i64
+ %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+ %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+ store i32 0, ptr %addr
+ %inner.cond = icmp ult i32 %inner.iv_next, 8
+ br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting: ; preds = %inner.header_latch_exiting
+ %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+ %outer.cond = icmp ult i32 %outer.iv_next, 11
+ br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
end: ; preds = %outer.latch_exiting
ret void
}
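
For reference, with an asserts-enabled build (the test carries REQUIRES:
asserts) the new debug output can be exercised roughly as the RUN line does,
e.g.:

  opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug \
    llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll 2>&1 \
    | FileCheck llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll

The size of the per-subloop bonus can be tuned with the new
-amdgpu-unroll-threshold-nested-static option (default 200).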