[llvm] [WIP] [InstCombine] Div ceil optimizations (PR #190175)

Thu Apr 2 06:25:01 PDT 2026

https://github.com/Takashiidobe created https://github.com/llvm/llvm-project/pull/190175

This PR improves handling of `div_ceil` from rust (which emits a div + rem).

Currently, these three rust functions:
```rust
use std::hint::assert_unchecked;

#[unsafe(no_mangle)]
pub fn div_ceil_without_assume(x: u32) -> u32 {
    x.div_ceil(7)
}

#[unsafe(no_mangle)]
pub fn div_ceil_with_assume(x: u32) -> u32 {
    unsafe {
        assert_unchecked(x <= u32::MAX - 7);
    }
    x.div_ceil(7)
}

#[unsafe(no_mangle)]
pub fn div_ceil_with_range(x: u32) -> u32 {
    x.count_zeros().div_ceil(7)
}
```

Will emit this IR (cleaned up). The IR looks pretty good to me, as both the assert_unchecked and the popcount provide range information.

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %r = urem i32 %x, 7
  %_4.not = icmp ne i32 %r, 0
  %0 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %0
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %d = udiv i32 %x, 7
  %r = urem i32 %x, 7
  %_5.not = icmp ne i32 %r, 0
  %0 = zext i1 %_5.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %0
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %r4 = urem i8 %d.lhs.trunc, 7
  %_6.not = icmp ne i8 %r4, 0
  %1 = zext i1 %_6.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %1, %d.zext
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```
After running through opt and llc on main with popcnt for less noise:
`build/bin/opt -O2 -S < test.ll | build/bin/llc --x86-asm-syntax=intel -mattr=+popcnt -O2 -o -` 

```asm
div_ceil_without_assume:                # @div_ceil_without_assume                                                  
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        lea     eax, [8*rdx]                                                                                        
        mov     ecx, edx                                                                                            
        sub     ecx, eax                                                                                            
        xor     eax, eax                                                                                            
        add     ecx, edi                                                                                            
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
div_ceil_with_assume:                   # @div_ceil_with_assume                                                     
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        lea     eax, [8*rdx]                                                                                        
        mov     ecx, edx                                                                                            
        sub     ecx, eax                                                                                            
        xor     eax, eax                                                                                            
        add     ecx, edi                                                                                            
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
div_ceil_with_range:                    # @div_ceil_with_range                                                      
        not     edi                                                                                                 
        popcnt  ecx, edi                                                                                            
        lea     eax, [rcx + 8*rcx]                                                                                  
        lea     edx, [rcx + 4*rax]                                                                                  
        shr     edx, 8                                                                                              
        lea     esi, [8*rdx]                                                                                        
        sub     esi, edx                                                                                            
        xor     eax, eax                                                                                            
        cmp     cl, sil                                                                                             
        setne   al                                                                                                  
        add     eax, edx                                                                                            
        ret                                                                                                         
```

(as a sidenote, just running llc seems to generate better code? I find this a bit odd)

`build/bin/llc test.ll --x86-asm-syntax=intel -mattr=+popcnt -O2 -o -`

```asm
div_ceil_without_assume:                # @div_ceil_without_assume                                                  
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        mov     rax, rdx                                                                                            
        imul    ecx, edi, -1227133513                                                                               
        cmp     ecx, 613566757                                                                                      
        sbb     eax, -1                                                                                             
        ret                                                                                                         
div_ceil_with_assume:                   # @div_ceil_with_assume                                                     
        mov     eax, edi                                                                                            
        movabs  rcx, 2635249153617166336                                                                            
        mul     rcx                                                                                                 
        mov     rax, rdx                                                                                            
        imul    ecx, edi, -1227133513                                                                               
        cmp     ecx, 613566757                                                                                      
        sbb     eax, -1                                                                                             
        ret                                                                                                         
div_ceil_with_range:                    # @div_ceil_with_range                                                      
        not     edi                                                                                                 
        popcnt  ecx, edi                                                                                            
        lea     eax, [rcx + 8*rcx]                                                                                  
        lea     eax, [rcx + 4*rax]                                                                                  
        shr     eax, 8                                                                                              
        imul    ecx, ecx, -73                                                                                       
        cmp     cl, 37 
```

Anyway the llvm IR when run through opt at -O2:

On main:

The only changes I see are that the urem is rewritten to a mul + sub in all three functions, and that the range information is preserved but not used.

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_4.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_5.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_5.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %1 = mul i8 %d3, 7
  %r4.decomposed = sub i8 %d.lhs.trunc, %1
  %_6.not = icmp ne i8 %r4.decomposed, 0
  %2 = zext i1 %_6.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %2, %d.zext
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```

However, both the assume and the popcount provide range information so we can do an optimization. Div_ceil is emitted as a udiv and urem. We can combine them given the following rules:

Assuming floor division X / Y, we can add Y - 1 to X before dividing: the result is floor(X / Y) + 1 whenever X is not a multiple of Y, and floor(X / Y) unchanged when it is, which is exactly div_ceil(X, Y).

So the formula in general is:

```
div_ceil(X, Y) = X / Y + (1 if X % Y > 0 else 0)  -> (X + Y - 1) / Y
```

But this fails since I forgot about wrapping: X + Y - 1 can overflow. So we need a condition that X + Y - 1 does not wrap, and of course Y cannot be 0. Technically we should also skip Y = 1, since X / 1 is a trivial identity.

That gets us:

```
add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0))) -> udiv(add nuw(X, Y - 1), Y)
```

[Alive proof here](https://alive2.llvm.org/ce/z/nBvyv4)

This is where I thought I was done, because this should handle both the range case and the assume case (the assume case provides less information than the popcount case, because popcount's range for X is narrower).

Unfortunately this only optimizes one case (the assume case, which uses the full-width code). The culprit in the popcount case is a trunc that narrows the popcount result to an i8, since narrower arithmetic allows for better optimizations before zero extending.

```llvm
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %d3 = udiv i8 %d.lhs.trunc, 7
  %d.zext = zext nneg i8 %d3 to i32
  %r4 = urem i8 %d.lhs.trunc, 7
  %_6.not = icmp ne i8 %r4, 0
  %1 = zext i1 %_6.not to i32
```

So we need to handle another form, where the udiv result is zexted to the return type (because its operand was previously truncated, due to the popcount range).

```
add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0))) -> zext(udiv(add nuw(X, Y - 1), Y))
```

[Alive2 Proof](https://alive2.llvm.org/ce/z/w7bRZW)

Another oddity I found was that constant-range information wasn't being propagated through trunc, so I added that to ValueTracking.cpp so that this fold fires too.

On this branch the IR now transforms to:

```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
  %d = udiv i32 %x, 7
  %0 = mul i32 %d, 7
  %r.decomposed = sub i32 %x, %0
  %_4.not = icmp ne i32 %r.decomposed, 0
  %1 = zext i1 %_4.not to i32
  %_0.sroa.0.0 = add nuw nsw i32 %d, %1
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
  %cond = icmp ult i32 %x, -7
  tail call void @llvm.assume(i1 %cond)
  %0 = add nuw i32 %x, 6
  %_0.sroa.0.0 = udiv i32 %0, 7
  ret i32 %_0.sroa.0.0
}

define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
  %self = xor i32 %x, -1
  %0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
  %d.lhs.trunc = trunc nuw nsw i32 %0 to i8
  %1 = add nuw nsw i8 %d.lhs.trunc, 6
  %2 = udiv i8 %1, 7
  %_0.sroa.0.0 = zext nneg i8 %2 to i32
  ret i32 %_0.sroa.0.0
}

declare void @llvm.assume(i1 noundef)

declare i32 @llvm.ctpop.i32(i32)
```

And we get this asm emitted:

```asm
div_ceil_without_assume:                # @div_ceil_without_assume
        mov     eax, edi
        movabs  rcx, 2635249153617166336
        mul     rcx
        lea     eax, [8*rdx]
        mov     ecx, edx
        sub     ecx, eax
        xor     eax, eax
        add     ecx, edi
        setne   al
        add     eax, edx
        ret
div_ceil_with_assume:                   # @div_ceil_with_assume
        lea     eax, [rdi + 6]
        movabs  rcx, 2635249153617166336
        mul     rcx
        mov     rax, rdx
        ret
div_ceil_with_range:                    # @div_ceil_with_range
        not     edi
        popcnt  eax, edi
        add     al, 6
        movzx   eax, al
        imul    eax, eax, 147
        shr     eax, 10
        ret
```


>From c95ada852c3bfdb4db2c571f7f2c50f3d58761cf Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 16:45:08 -0400
Subject: [PATCH 1/5] add tests for div_ceil folding which currently fail

---
 .../InstCombine/InstCombineAddSub.cpp         |  43 ++++++
 .../InstCombine/InstCombineInternal.h         |   3 +
 .../Transforms/InstCombine/add-divceil.ll     | 132 ++++++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/add-divceil.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c781c6978b275..9865143ae1aa0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1523,6 +1523,46 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   return nullptr;
 }
 
+// Fold the div_ceil idiom:
+//   add(udiv(A, C), zext(icmp ne(urem(A, C), 0)))
+//     -> udiv(add nuw(A, C - 1), C)
+// The zext of the icmp is just type-plumbing (i1 -> A's type); the fold
+// stays entirely in A's type. Valid when A + (C-1) is provably non-wrapping,
+// checked via ConstantRange (range attributes, assume, etc.).
+Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+  Value *A, *A2;
+  const APInt *C1, *C2;
+  CmpPredicate Pred;
+
+  // Bind A and A2 independently so m_c_Add handles both operand orderings.
+  auto UDivPat = m_OneUse(m_UDiv(m_Value(A), m_APInt(C1)));
+  auto URemPat = m_OneUse(m_URem(m_Value(A2), m_APInt(C2)));
+  auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+  auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
+
+  if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
+      A != A2 || *C1 != *C2 || !C1->ugt(1))
+    return nullptr;
+
+  // Require A + (C-1) to not overflow unsigned in A's type.
+  // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
+  // from known-zero high bits, so it loses range info for near-max values
+  // (e.g. an assume of "a < UINT_MAX-5" on a 32-bit value leaves no
+  // universally-zero bits and getMaxValue() returns UINT_MAX).
+  // ConstantRange tracks the full [lo, hi) interval and gives a tight max.
+  // Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
+  // computeConstantRange, so it won't pick up llvm.assume; call the full form.
+  ConstantRange CR = computeConstantRange(A, /*ForSigned=*/false,
+                                          /*UseInstrInfo=*/true, &AC, &I, &DT);
+  APInt UMax = CR.getUnsignedMax();
+  if (UMax.ugt(APInt::getMaxValue(UMax.getBitWidth()) - (*C1 - 1)))
+    return nullptr;
+
+  Value *CMinusOne = ConstantInt::get(A->getType(), *C1 - 1);
+  Value *NUWAdd = Builder.CreateAdd(A, CMinusOne, "", /*HasNUW=*/true);
+  return BinaryOperator::CreateUDiv(NUWAdd, ConstantInt::get(A->getType(), *C1));
+}
+
 Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Value *V = simplifyAddInst(I.getOperand(0), I.getOperand(1),
                                  I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
@@ -1915,6 +1955,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
     return Res;
 
+  if (Instruction *Res = foldDivCeil(I))
+    return Res;
+
   // Re-enqueue users of the induction variable of add recurrence if we infer
   // new nuw/nsw flags.
   if (Changed) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 160f766b60973..f58fc5dcc876b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,6 +544,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// (Binop (cast C), (select C, T, F))
   ///    -> (select C, C0, C1)
   Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
+  /// (add (zext (udiv A, C)), (zext (icmp ne (urem A, C), 0)))
+  ///    -> (zext (udiv (add nuw A, C-1), C))
+  Instruction *foldDivCeil(BinaryOperator &I);
 
   /// This tries to simplify binary operations by factorizing out common terms
   /// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").
diff --git a/llvm/test/Transforms/InstCombine/add-divceil.ll b/llvm/test/Transforms/InstCombine/add-divceil.ll
new file mode 100644
index 0000000000000..bb8c82fefff74
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/add-divceil.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Fold the div_ceil idiom (narrow-add form):
+;   add(udiv(A, C), zext i1(icmp ne(urem(A, C), 0)))
+;     -> udiv(add nuw(A, C - 1), C)
+; when A + (C-1) is provably non-wrapping (via KnownBits / range info).
+
+declare void @use(i8)
+declare void @llvm.assume(i1)
+
+; Basic: i8 in [1, 32] -> bits 7,6,5 known zero, max = 63; 63+6 <= 255.
+define i8 @divceil_i8_bounded(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_bounded(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i8 [[A:%.*]], 6
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %a, 7
+  %r = urem i8 %a, 7
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}
+
+; Commuted: zext(icmp) on the left of add.
+define i8 @divceil_i8_bounded_commuted(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_bounded_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i8 [[A:%.*]], 6
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %a, 7
+  %r = urem i8 %a, 7
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %round, %q
+  ret i8 %result
+}
+
+; With llvm.assume: A <= 100, so bit 7 known zero, max = 127; 127+6 <= 255.
+define i8 @divceil_i8_assume(i8 %a) {
+; CHECK-LABEL: @divceil_i8_assume(
+; CHECK-NEXT:    [[OK:%.*]] = icmp ult i8 [[A:%.*]], 101
+; CHECK-NEXT:    call void @llvm.assume(i1 [[OK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i8 [[A]], 6
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %ok = icmp ule i8 %a, 100
+  call void @llvm.assume(i1 %ok)
+  %q = udiv i8 %a, 7
+  %r = urem i8 %a, 7
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}
+
+; With assume a < UINT_MAX-5 (i.e. a <= UINT_MAX-6): ConstantRange max =
+; UINT_MAX-6; no high bits are universally zero so KnownBits alone would fail,
+; but ConstantRange handles it. This is the form Rust emits for
+; assert_unchecked(a <= u32::MAX - 6) on a div_ceil(7) call.
+define i32 @divceil_i32_assume_near_max(i32 %a) {
+; CHECK-LABEL: @divceil_i32_assume_near_max(
+; CHECK-NEXT:    [[OK:%.*]] = icmp ult i32 [[A:%.*]], -6
+; CHECK-NEXT:    call void @llvm.assume(i1 [[OK]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[A]], 6
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i32 [[TMP1]], 7
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+  %ok = icmp ult i32 %a, -6
+  call void @llvm.assume(i1 %ok)
+  %q = udiv i32 %a, 7
+  %r = urem i32 %a, 7
+  %cond = icmp ne i32 %r, 0
+  %round = zext i1 %cond to i32
+  %result = add i32 %q, %round
+  ret i32 %result
+}
+
+; Negative: no range info -> KnownBits max = 255; 255+6 overflows.
+define i8 @divceil_i8_unbounded(i8 %a) {
+; CHECK-LABEL: @divceil_i8_unbounded(
+; CHECK-NEXT:    [[Q:%.*]] = udiv i8 [[A:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[A]], 7
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT:    [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT:    [[RESULT:%.*]] = add nuw nsw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %a, 7
+  %r = urem i8 %a, 7
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}
+
+; Negative: udiv has multiple uses -> one-use check fails.
+define i8 @divceil_i8_udiv_multiuse(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_udiv_multiuse(
+; CHECK-NEXT:    [[Q:%.*]] = udiv i8 [[A:%.*]], 7
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[A]], 7
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT:    [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT:    [[RESULT:%.*]] = add nuw nsw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT:    call void @use(i8 [[Q]])
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %a, 7
+  %r = urem i8 %a, 7
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  call void @use(i8 %q)
+  ret i8 %result
+}
+
+; Negative: divisor == 1 -> no-op (udiv and urem by 1 are trivial).
+define i8 @divceil_i8_div1(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_div1(
+; CHECK-NEXT:    ret i8 [[A:%.*]]
+;
+  %q = udiv i8 %a, 1
+  %r = urem i8 %a, 1
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}

>From 5406e664b12a57191b6973634ae502df75215f8b Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 17:02:39 -0400
Subject: [PATCH 2/5] allow the narrow div ceil fold, add(udiv(X, C), zext(icmp
 ne(urem(X, C), 0)) -> udiv(add nuw(A, C - 1), C) to handle variable divisors

---
 .../InstCombine/InstCombineAddSub.cpp         | 44 +++++----
 .../InstCombine/InstCombineInternal.h         |  4 +-
 .../Transforms/InstCombine/add-divceil.ll     | 92 ++++++++++++++++++-
 3 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 9865143ae1aa0..0295ef6272aa8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1524,43 +1524,47 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
 }
 
 // Fold the div_ceil idiom:
-//   add(udiv(A, C), zext(icmp ne(urem(A, C), 0)))
-//     -> udiv(add nuw(A, C - 1), C)
-// The zext of the icmp is just type-plumbing (i1 -> A's type); the fold
-// stays entirely in A's type. Valid when A + (C-1) is provably non-wrapping,
-// checked via ConstantRange (range attributes, assume, etc.).
+//   add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+//     -> udiv(add nuw(X, Y - 1), Y)
+// The zext of the icmp is just type-plumbing (i1 -> X's type); the fold
+// stays entirely in X's type. Valid when X + (Y-1) is provably non-wrapping,
+// checked via ConstantRange on both operands (range attributes, assume, etc.).
 Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
-  Value *A, *A2;
-  const APInt *C1, *C2;
+  Value *X, *X2, *Y, *Y2;
   CmpPredicate Pred;
 
-  // Bind A and A2 independently so m_c_Add handles both operand orderings.
-  auto UDivPat = m_OneUse(m_UDiv(m_Value(A), m_APInt(C1)));
-  auto URemPat = m_OneUse(m_URem(m_Value(A2), m_APInt(C2)));
+  // Bind X and X2 independently so m_c_Add handles both operand orderings.
+  auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+  auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
   auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
   auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
 
   if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
-      A != A2 || *C1 != *C2 || !C1->ugt(1))
+      X != X2 || Y != Y2)
     return nullptr;
 
-  // Require A + (C-1) to not overflow unsigned in A's type.
+  // Require X + (Y-1) to not overflow unsigned in X's type.
   // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
   // from known-zero high bits, so it loses range info for near-max values
-  // (e.g. an assume of "a < UINT_MAX-5" on a 32-bit value leaves no
+  // (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
   // universally-zero bits and getMaxValue() returns UINT_MAX).
   // ConstantRange tracks the full [lo, hi) interval and gives a tight max.
   // Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
   // computeConstantRange, so it won't pick up llvm.assume; call the full form.
-  ConstantRange CR = computeConstantRange(A, /*ForSigned=*/false,
-                                          /*UseInstrInfo=*/true, &AC, &I, &DT);
-  APInt UMax = CR.getUnsignedMax();
-  if (UMax.ugt(APInt::getMaxValue(UMax.getBitWidth()) - (*C1 - 1)))
+  ConstantRange CRX = computeConstantRange(X, /*ForSigned=*/false,
+                                           /*UseInstrInfo=*/true, &AC, &I, &DT);
+  ConstantRange CRY = computeConstantRange(Y, /*ForSigned=*/false,
+                                           /*UseInstrInfo=*/true, &AC, &I, &DT);
+  APInt MaxX = CRX.getUnsignedMax();
+  APInt MaxY = CRY.getUnsignedMax();
+  unsigned BitWidth = MaxX.getBitWidth();
+  // MaxX + (MaxY - 1) <= UINT_MAX  <==>  MaxX <= UINT_MAX - (MaxY - 1)
+  if (MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1)))
     return nullptr;
 
-  Value *CMinusOne = ConstantInt::get(A->getType(), *C1 - 1);
-  Value *NUWAdd = Builder.CreateAdd(A, CMinusOne, "", /*HasNUW=*/true);
-  return BinaryOperator::CreateUDiv(NUWAdd, ConstantInt::get(A->getType(), *C1));
+  Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+  Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+  return BinaryOperator::CreateUDiv(NUWAdd, Y);
 }
 
 Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f58fc5dcc876b..93bcb66369775 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,8 +544,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// (Binop (cast C), (select C, T, F))
   ///    -> (select C, C0, C1)
   Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
-  /// (add (zext (udiv A, C)), (zext (icmp ne (urem A, C), 0)))
-  ///    -> (zext (udiv (add nuw A, C-1), C))
+  /// (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
+  ///    -> (udiv (add nuw X, Y-1), Y)
   Instruction *foldDivCeil(BinaryOperator &I);
 
   /// This tries to simplify binary operations by factorizing out common terms
diff --git a/llvm/test/Transforms/InstCombine/add-divceil.ll b/llvm/test/Transforms/InstCombine/add-divceil.ll
index bb8c82fefff74..9bdb3427f1efc 100644
--- a/llvm/test/Transforms/InstCombine/add-divceil.ll
+++ b/llvm/test/Transforms/InstCombine/add-divceil.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=instcombine -S | FileCheck %s
 
-; Fold the div_ceil idiom (narrow-add form):
-;   add(udiv(A, C), zext i1(icmp ne(urem(A, C), 0)))
-;     -> udiv(add nuw(A, C - 1), C)
-; when A + (C-1) is provably non-wrapping (via KnownBits / range info).
+; Fold the div_ceil idiom:
+;   add(udiv(X, Y), zext i1(icmp ne(urem(X, Y), 0)))
+;     -> udiv(add nuw(X, Y - 1), Y)
+; when X + (Y-1) is provably non-wrapping (via range info on both X and Y).
 
 declare void @use(i8)
 declare void @llvm.assume(i1)
@@ -130,3 +130,87 @@ define i8 @divceil_i8_div1(i8 range(i8 1, 33) %a) {
   %result = add i8 %q, %round
   ret i8 %result
 }
+
+; Variable divisor: both X in [0,100] and Y in [1,10], so max X+(Y-1) = 109 <= 255.
+define i8 @divceil_i8_var_divisor(i8 range(i8 0, 101) %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i8 [[Y:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i8 [[TMP2]], [[Y]]
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %x, %y
+  %r = urem i8 %x, %y
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}
+
+; Variable divisor, commuted add.
+define i8 @divceil_i8_var_divisor_commuted(i8 range(i8 0, 101) %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i8 [[Y:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i8 [[TMP2]], [[Y]]
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %x, %y
+  %r = urem i8 %x, %y
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %round, %q
+  ret i8 %result
+}
+
+; Variable divisor with i32: X in [0, 100], Y in [2, 8], max X+(Y-1) = 107 <= UINT32_MAX.
+define i32 @divceil_i32_var_divisor(i32 range(i32 0, 101) %x, i32 range(i32 2, 9) %y) {
+; CHECK-LABEL: @divceil_i32_var_divisor(
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[Y:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[RESULT:%.*]] = udiv i32 [[TMP2]], [[Y]]
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+  %q = udiv i32 %x, %y
+  %r = urem i32 %x, %y
+  %cond = icmp ne i32 %r, 0
+  %round = zext i1 %cond to i32
+  %result = add i32 %q, %round
+  ret i32 %result
+}
+
+; Negative: Y unbounded -> max Y = 255, max X+(Y-1) = 100+254 overflows i8.
+define i8 @divceil_i8_var_divisor_y_unbounded(i8 range(i8 0, 101) %x, i8 %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_y_unbounded(
+; CHECK-NEXT:    [[Q:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT:    [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT:    [[RESULT:%.*]] = add nuw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %x, %y
+  %r = urem i8 %x, %y
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}
+
+; Negative: X unbounded -> max X+(Y-1) overflows even with bounded Y.
+define i8 @divceil_i8_var_divisor_x_unbounded(i8 %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_x_unbounded(
+; CHECK-NEXT:    [[Q:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[R:%.*]] = urem i8 [[X]], [[Y]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT:    [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT:    [[RESULT:%.*]] = add i8 [[Q]], [[ROUND]]
+; CHECK-NEXT:    ret i8 [[RESULT]]
+;
+  %q = udiv i8 %x, %y
+  %r = urem i8 %x, %y
+  %cond = icmp ne i8 %r, 0
+  %round = zext i1 %cond to i8
+  %result = add i8 %q, %round
+  ret i8 %result
+}

>From b089e7df8288ffe1aac6453afd5c8cccc301be53 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 20:47:28 -0400
Subject: [PATCH 3/5] add wide div ceil folding so range information properly
 propagates even when the dividend is truncated to a narrower width

---
 llvm/lib/Analysis/ValueTracking.cpp           |  5 ++
 .../InstCombine/InstCombineAddSub.cpp         | 74 +++++++++++++------
 2 files changed, 55 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 340a616f13e19..2460397cba138 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -10361,6 +10361,11 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
         SI->getFalseValue(), ForSigned, UseInstrInfo, AC, CtxI, DT, Depth + 1);
     CR = CRTrue.unionWith(CRFalse);
     CR = CR.intersectWith(getRangeForSelectPattern(*SI, IIQ));
+  } else if (auto *TI = dyn_cast<TruncInst>(V)) {
+    ConstantRange SrcCR =
+        computeConstantRange(TI->getOperand(0), ForSigned, UseInstrInfo, AC,
+                             CtxI, DT, Depth + 1);
+    CR = SrcCR.truncate(BitWidth);
   } else if (isa<FPToUIInst>(V) || isa<FPToSIInst>(V)) {
     APInt Lower = APInt(BitWidth, 0);
     APInt Upper = APInt(BitWidth, 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0295ef6272aa8..6f05261d335fa 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1524,26 +1524,16 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
 }
 
 // Fold the div_ceil idiom:
-//   add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
-//     -> udiv(add nuw(X, Y - 1), Y)
-// The zext of the icmp is just type-plumbing (i1 -> X's type); the fold
-// stays entirely in X's type. Valid when X + (Y-1) is provably non-wrapping,
-// checked via ConstantRange on both operands (range attributes, assume, etc.).
-Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
-  Value *X, *X2, *Y, *Y2;
-  CmpPredicate Pred;
-
-  // Bind X and X2 independently so m_c_Add handles both operand orderings.
-  auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
-  auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
-  auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
-  auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
-
-  if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
-      X != X2 || Y != Y2)
-    return nullptr;
-
-  // Require X + (Y-1) to not overflow unsigned in X's type.
+//   Wide form:  add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+//                 -> udiv(add nuw(X, Y - 1), Y)
+//   Narrow form: add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
+//                 -> zext(udiv(add nuw(X, Y - 1), Y))
+// In the narrow form X and Y operate in a type narrower than the add; the
+// result is zero-extended back to the add's type.
+// Valid when X + (Y-1) is provably non-wrapping in X's type, checked via
+// ConstantRange (range attributes, assume, etc.).
+static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
+                             AssumptionCache &AC, DominatorTree &DT) {
   // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
   // from known-zero high bits, so it loses range info for near-max values
   // (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
@@ -1559,12 +1549,48 @@ Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
   APInt MaxY = CRY.getUnsignedMax();
   unsigned BitWidth = MaxX.getBitWidth();
   // MaxX + (MaxY - 1) <= UINT_MAX  <==>  MaxX <= UINT_MAX - (MaxY - 1)
-  if (MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1)))
+  return !MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1));
+}
+
+Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+  Value *X, *X2, *Y, *Y2;
+  CmpPredicate Pred;
+
+  // Wide form: udiv and urem are the same type as the add.
+  auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+  auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
+  auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+  auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
+
+  if (match(&I, m_c_Add(UDivPat, ZExtCmpPat)) && Pred == ICmpInst::ICMP_NE &&
+      X == X2 && Y == Y2 && checkDivCeilNUW(X, Y, I, AC, DT)) {
+    Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+    Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+    return BinaryOperator::CreateUDiv(NUWAdd, Y);
+  }
+
+  // Narrow form: udiv and urem are in a narrower type; both are zext'd before
+  // the add.  add(zext(udiv(X,Y)), zext(ne(urem(X,Y),0)))
+  //   -> zext(udiv(add nuw(X, Y-1), Y))
+  Value *NX, *NX2, *NY, *NY2;
+  CmpPredicate Pred2;
+  auto NUDivPat = m_OneUse(m_UDiv(m_Value(NX), m_Value(NY)));
+  auto NURemPat = m_OneUse(m_URem(m_Value(NX2), m_Value(NY2)));
+  auto NICmpPat = m_OneUse(m_ICmp(Pred2, NURemPat, m_Zero()));
+  auto ZExtDivPat = m_OneUse(m_ZExt(NUDivPat));
+  auto ZExtNarrowCmpPat = m_OneUse(m_ZExt(NICmpPat));
+
+  if (!match(&I, m_c_Add(ZExtDivPat, ZExtNarrowCmpPat)) ||
+      Pred2 != ICmpInst::ICMP_NE || NX != NX2 || NY != NY2)
+    return nullptr;
+
+  if (!checkDivCeilNUW(NX, NY, I, AC, DT))
     return nullptr;
 
-  Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
-  Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
-  return BinaryOperator::CreateUDiv(NUWAdd, Y);
+  Value *YMinusOne = Builder.CreateSub(NY, ConstantInt::get(NY->getType(), 1));
+  Value *NUWAdd = Builder.CreateAdd(NX, YMinusOne, "", /*HasNUW=*/true);
+  Value *Div = Builder.CreateUDiv(NUWAdd, NY);
+  return new ZExtInst(Div, I.getType());
 }
 
 Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {

>From 2c723332ce7e3079edcbd3ca4062baea904ae4ed Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 20:49:27 -0400
Subject: [PATCH 4/5] fix failing test whose sub now gets nuw because of
 range propagation through trunc

---
 llvm/test/Transforms/InstCombine/fls.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/InstCombine/fls.ll b/llvm/test/Transforms/InstCombine/fls.ll
index 68bc0a2fc8a1d..ea757268259f5 100644
--- a/llvm/test/Transforms/InstCombine/fls.ll
+++ b/llvm/test/Transforms/InstCombine/fls.ll
@@ -33,7 +33,7 @@ define i32 @flsnotconst(i64 %z) {
 ; CHECK-LABEL: @flsnotconst(
 ; CHECK-NEXT:    [[CTLZ:%.*]] = call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[Z:%.*]], i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc nuw nsw i64 [[CTLZ]] to i32
-; CHECK-NEXT:    [[GOO:%.*]] = sub nsw i32 64, [[TMP1]]
+; CHECK-NEXT:    [[GOO:%.*]] = sub nuw nsw i32 64, [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[GOO]]
 ;
   %goo = call i32 @flsl(i64 %z)

>From 31ab576383e62f8f9cb2f52f1735bbfc2b228616 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Thu, 2 Apr 2026 08:57:17 -0400
Subject: [PATCH 5/5] split folds out

---
 .../InstCombine/InstCombineAddSub.cpp         | 88 ++++++++++---------
 .../InstCombine/InstCombineInternal.h         |  7 +-
 2 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 6f05261d335fa..fc39de2a1c0c6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1523,24 +1523,17 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
   return nullptr;
 }
 
-// Fold the div_ceil idiom:
-//   Wide form:  add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
-//                 -> udiv(add nuw(X, Y - 1), Y)
-//   Narrow form: add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
-//                 -> zext(udiv(add nuw(X, Y - 1), Y))
-// In the narrow form X and Y operate in a type narrower than the add; the
-// result is zero-extended back to the add's type.
-// Valid when X + (Y-1) is provably non-wrapping in X's type, checked via
+// Return true if X + (Y-1) is provably non-wrapping in X's type, using
 // ConstantRange (range attributes, assume, etc.).
+// Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
+// from known-zero high bits, so it loses range info for near-max values
+// (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
+// universally-zero bits and getMaxValue() returns UINT_MAX).
+// ConstantRange tracks the full [lo, hi) interval and gives a tight max.
+// Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
+// computeConstantRange, so it won't pick up llvm.assume; call the full form.
 static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
                              AssumptionCache &AC, DominatorTree &DT) {
-  // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
-  // from known-zero high bits, so it loses range info for near-max values
-  // (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
-  // universally-zero bits and getMaxValue() returns UINT_MAX).
-  // ConstantRange tracks the full [lo, hi) interval and gives a tight max.
-  // Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
-  // computeConstantRange, so it won't pick up llvm.assume; call the full form.
   ConstantRange CRX = computeConstantRange(X, /*ForSigned=*/false,
                                            /*UseInstrInfo=*/true, &AC, &I, &DT);
   ConstantRange CRY = computeConstantRange(Y, /*ForSigned=*/false,
@@ -1552,44 +1545,52 @@ static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
   return !MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1));
 }
 
-Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+// Fold the wide form of the div_ceil idiom:
+//   add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+//     -> udiv(add nuw(X, Y - 1), Y)
+// udiv and urem are the same type as the add.
+// Valid when X + (Y-1) is provably non-wrapping in X's type.
+Instruction *InstCombinerImpl::foldWideDivCeil(BinaryOperator &I) {
   Value *X, *X2, *Y, *Y2;
   CmpPredicate Pred;
 
-  // Wide form: udiv and urem are the same type as the add.
   auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
   auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
   auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
   auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
 
-  if (match(&I, m_c_Add(UDivPat, ZExtCmpPat)) && Pred == ICmpInst::ICMP_NE &&
-      X == X2 && Y == Y2 && checkDivCeilNUW(X, Y, I, AC, DT)) {
-    Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
-    Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
-    return BinaryOperator::CreateUDiv(NUWAdd, Y);
-  }
-
-  // Narrow form: udiv and urem are in a narrower type; both are zext'd before
-  // the add.  add(zext(udiv(X,Y)), zext(ne(urem(X,Y),0)))
-  //   -> zext(udiv(add nuw(X, Y-1), Y))
-  Value *NX, *NX2, *NY, *NY2;
-  CmpPredicate Pred2;
-  auto NUDivPat = m_OneUse(m_UDiv(m_Value(NX), m_Value(NY)));
-  auto NURemPat = m_OneUse(m_URem(m_Value(NX2), m_Value(NY2)));
-  auto NICmpPat = m_OneUse(m_ICmp(Pred2, NURemPat, m_Zero()));
-  auto ZExtDivPat = m_OneUse(m_ZExt(NUDivPat));
-  auto ZExtNarrowCmpPat = m_OneUse(m_ZExt(NICmpPat));
-
-  if (!match(&I, m_c_Add(ZExtDivPat, ZExtNarrowCmpPat)) ||
-      Pred2 != ICmpInst::ICMP_NE || NX != NX2 || NY != NY2)
+  if (!match(&I, m_c_Add(UDivPat, ZExtCmpPat)) || Pred != ICmpInst::ICMP_NE ||
+      X != X2 || Y != Y2 || !checkDivCeilNUW(X, Y, I, AC, DT))
     return nullptr;
 
-  if (!checkDivCeilNUW(NX, NY, I, AC, DT))
+  Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+  Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+  return BinaryOperator::CreateUDiv(NUWAdd, Y);
+}
+
+// Fold the narrow form of the div_ceil idiom:
+//   add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
+//     -> zext(udiv(add nuw(X, Y - 1), Y))
+// udiv and urem operate in a type narrower than the add; the result is
+// zero-extended back to the add's type.
+// Valid when X + (Y-1) is provably non-wrapping in X's type.
+Instruction *InstCombinerImpl::foldNarrowDivCeil(BinaryOperator &I) {
+  Value *X, *X2, *Y, *Y2;
+  CmpPredicate Pred;
+
+  auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+  auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
+  auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+  auto ZExtDivPat = m_OneUse(m_ZExt(UDivPat));
+  auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
+
+  if (!match(&I, m_c_Add(ZExtDivPat, ZExtCmpPat)) || Pred != ICmpInst::ICMP_NE ||
+      X != X2 || Y != Y2 || !checkDivCeilNUW(X, Y, I, AC, DT))
     return nullptr;
 
-  Value *YMinusOne = Builder.CreateSub(NY, ConstantInt::get(NY->getType(), 1));
-  Value *NUWAdd = Builder.CreateAdd(NX, YMinusOne, "", /*HasNUW=*/true);
-  Value *Div = Builder.CreateUDiv(NUWAdd, NY);
+  Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+  Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+  Value *Div = Builder.CreateUDiv(NUWAdd, Y);
   return new ZExtInst(Div, I.getType());
 }
 
@@ -1985,7 +1986,10 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
   if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
     return Res;
 
-  if (Instruction *Res = foldDivCeil(I))
+  if (Instruction *Res = foldWideDivCeil(I))
+    return Res;
+
+  if (Instruction *Res = foldNarrowDivCeil(I))
     return Res;
 
   // Re-enqueue users of the induction variable of add recurrence if we infer
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 93bcb66369775..dfebf0e6dcb67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,9 +544,12 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   /// (Binop (cast C), (select C, T, F))
   ///    -> (select C, C0, C1)
   Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
-  /// (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
+  /// Wide form: (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
   ///    -> (udiv (add nuw X, Y-1), Y)
-  Instruction *foldDivCeil(BinaryOperator &I);
+  Instruction *foldWideDivCeil(BinaryOperator &I);
+  /// Narrow form: (add (zext (udiv X, Y)), (zext (icmp ne (urem X, Y), 0)))
+  ///    -> (zext (udiv (add nuw X, Y-1), Y))
+  Instruction *foldNarrowDivCeil(BinaryOperator &I);
 
   /// This tries to simplify binary operations by factorizing out common terms
   /// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").