[llvm] [WIP] [InstCombine] Div ceil optimizations (PR #190175)
Takashi Idobe via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 06:25:01 PDT 2026
https://github.com/Takashiidobe created https://github.com/llvm/llvm-project/pull/190175
This PR improves handling of `div_ceil` from rust (which emits a div + rem).
Currently, these three rust functions:
```rust
use std::hint::assert_unchecked;
#[unsafe(no_mangle)]
pub fn div_ceil_without_assume(x: u32) -> u32 {
x.div_ceil(7)
}
#[unsafe(no_mangle)]
pub fn div_ceil_with_assume(x: u32) -> u32 {
unsafe {
assert_unchecked(x <= u32::MAX - 7);
}
x.div_ceil(7)
}
#[unsafe(no_mangle)]
pub fn div_ceil_with_range(x: u32) -> u32 {
x.count_zeros().div_ceil(7)
}
```
Will emit this IR (cleaned up). The IR looks pretty good to me, as both the assert_unchecked and the popcount provide range information.
```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
%d = udiv i32 %x, 7
%r = urem i32 %x, 7
%_4.not = icmp ne i32 %r, 0
%0 = zext i1 %_4.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %d, %0
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
%cond = icmp ult i32 %x, -7
tail call void @llvm.assume(i1 %cond)
%d = udiv i32 %x, 7
%r = urem i32 %x, 7
%_5.not = icmp ne i32 %r, 0
%0 = zext i1 %_5.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %d, %0
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
%self = xor i32 %x, -1
%0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
%d.lhs.trunc = trunc nuw nsw i32 %0 to i8
%d3 = udiv i8 %d.lhs.trunc, 7
%d.zext = zext nneg i8 %d3 to i32
%r4 = urem i8 %d.lhs.trunc, 7
%_6.not = icmp ne i8 %r4, 0
%1 = zext i1 %_6.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %1, %d.zext
ret i32 %_0.sroa.0.0
}
declare void @llvm.assume(i1 noundef)
declare i32 @llvm.ctpop.i32(i32)
```
After running through opt and llc on main with popcnt for less noise:
`build/bin/opt -O2 -S < test.ll | build/bin/llc --x86-asm-syntax=intel -mattr=+popcnt -O2 -o -`
```asm
div_ceil_without_assume: # @div_ceil_without_assume
mov eax, edi
movabs rcx, 2635249153617166336
mul rcx
lea eax, [8*rdx]
mov ecx, edx
sub ecx, eax
xor eax, eax
add ecx, edi
setne al
add eax, edx
ret
div_ceil_with_assume: # @div_ceil_with_assume
mov eax, edi
movabs rcx, 2635249153617166336
mul rcx
lea eax, [8*rdx]
mov ecx, edx
sub ecx, eax
xor eax, eax
add ecx, edi
setne al
add eax, edx
ret
div_ceil_with_range: # @div_ceil_with_range
not edi
popcnt ecx, edi
lea eax, [rcx + 8*rcx]
lea edx, [rcx + 4*rax]
shr edx, 8
lea esi, [8*rdx]
sub esi, edx
xor eax, eax
cmp cl, sil
setne al
add eax, edx
ret
```
(as a sidenote, just running llc seems to generate better code? I find this a bit odd)
`build/bin/llc test.ll --x86-asm-syntax=intel -mattr=+popcnt -O2 -o -`
```asm
div_ceil_without_assume: # @div_ceil_without_assume
mov eax, edi
movabs rcx, 2635249153617166336
mul rcx
mov rax, rdx
imul ecx, edi, -1227133513
cmp ecx, 613566757
sbb eax, -1
ret
div_ceil_with_assume: # @div_ceil_with_assume
mov eax, edi
movabs rcx, 2635249153617166336
mul rcx
mov rax, rdx
imul ecx, edi, -1227133513
cmp ecx, 613566757
sbb eax, -1
ret
div_ceil_with_range: # @div_ceil_with_range
not edi
popcnt ecx, edi
lea eax, [rcx + 8*rcx]
lea eax, [rcx + 4*rax]
shr eax, 8
imul ecx, ecx, -73
cmp cl, 37
```
Anyway the llvm IR when run through opt at -O2:
On main:
The only change I see is the urem being rewritten to a mul + sub in each function; the range information is preserved but not used.
```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
%d = udiv i32 %x, 7
%0 = mul i32 %d, 7
%r.decomposed = sub i32 %x, %0
%_4.not = icmp ne i32 %r.decomposed, 0
%1 = zext i1 %_4.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %d, %1
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
%cond = icmp ult i32 %x, -7
tail call void @llvm.assume(i1 %cond)
%d = udiv i32 %x, 7
%0 = mul i32 %d, 7
%r.decomposed = sub i32 %x, %0
%_5.not = icmp ne i32 %r.decomposed, 0
%1 = zext i1 %_5.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %d, %1
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
%self = xor i32 %x, -1
%0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
%d.lhs.trunc = trunc nuw nsw i32 %0 to i8
%d3 = udiv i8 %d.lhs.trunc, 7
%d.zext = zext nneg i8 %d3 to i32
%1 = mul i8 %d3, 7
%r4.decomposed = sub i8 %d.lhs.trunc, %1
%_6.not = icmp ne i8 %r4.decomposed, 0
%2 = zext i1 %_6.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %2, %d.zext
ret i32 %_0.sroa.0.0
}
declare void @llvm.assume(i1 noundef)
declare i32 @llvm.ctpop.i32(i32)
```
However, both the assume and the popcount provide range information so we can do an optimization. Div_ceil is emitted as a udiv and urem. We can combine them given the following rules:
Assuming you have floor division of X / Y, we can add Y - 1 to X before dividing: floor((X + Y - 1) / Y) equals floor(X / Y) when Y divides X evenly, and floor(X / Y) + 1 otherwise — which is exactly ceiling division.
So the formula in general is:
```
div_ceil(X, Y) = X / Y + (1 if X % Y > 0 else 0) -> (X + Y - 1) / Y
```
But this fails as written since I forgot about wrapping. So we need a condition that X + Y - 1 does not wrap, and of course Y cannot be 0. Technically we should also skip division by 1, since X / 1 is a trivial identity.
That gets us:
```
add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0))) -> udiv(add nuw(X, Y - 1), Y)
```
[Alive proof here](https://alive2.llvm.org/ce/z/nBvyv4)
This is where I thought I was done, because this should handle both the range and the assume case (the assume case provides less information than the popcount, since popcount's range for X is narrower).
Unfortunately this only optimizes one case (the assume case with the wide code). The culprit here is in the popcount case, there's a trunc for the popcount to bound it to an i8 since narrower arithmetic allows for better optimizations before zero extending.
```llvm
%d.lhs.trunc = trunc nuw nsw i32 %0 to i8
%d3 = udiv i8 %d.lhs.trunc, 7
%d.zext = zext nneg i8 %d3 to i32
%r4 = urem i8 %d.lhs.trunc, 7
%_6.not = icmp ne i8 %r4, 0
%1 = zext i1 %_6.not to i32
```
So we need to handle another form, when the udiv needs to be zexted to the return type (because it was previously trunced, due to the popcount).
```
add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0))) -> zext(udiv(add nuw(X, Y - 1), Y))
```
[Alive2 Proof](https://alive2.llvm.org/ce/z/w7bRZW)
There was another oddity I found: trunc instructions weren't propagating constant ranges, so I added that to computeConstantRange in ValueTracking.cpp, which fixes this fold too.
On this branch the IR now transforms:
```llvm
define noundef range(i32 0, 613566758) i32 @div_ceil_without_assume(i32 noundef %x) unnamed_addr {
start:
%d = udiv i32 %x, 7
%0 = mul i32 %d, 7
%r.decomposed = sub i32 %x, %0
%_4.not = icmp ne i32 %r.decomposed, 0
%1 = zext i1 %_4.not to i32
%_0.sroa.0.0 = add nuw nsw i32 %d, %1
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 613566757) i32 @div_ceil_with_assume(i32 noundef %x) unnamed_addr {
start:
%cond = icmp ult i32 %x, -7
tail call void @llvm.assume(i1 %cond)
%0 = add nuw i32 %x, 6
%_0.sroa.0.0 = udiv i32 %0, 7
ret i32 %_0.sroa.0.0
}
define noundef range(i32 0, 6) i32 @div_ceil_with_range(i32 noundef %x) unnamed_addr {
start:
%self = xor i32 %x, -1
%0 = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 %self)
%d.lhs.trunc = trunc nuw nsw i32 %0 to i8
%1 = add nuw nsw i8 %d.lhs.trunc, 6
%2 = udiv i8 %1, 7
%_0.sroa.0.0 = zext nneg i8 %2 to i32
ret i32 %_0.sroa.0.0
}
declare void @llvm.assume(i1 noundef)
declare i32 @llvm.ctpop.i32(i32)
```
And we get this asm emitted:
```asm
div_ceil_without_assume: # @div_ceil_without_assume
mov eax, edi
movabs rcx, 2635249153617166336
mul rcx
lea eax, [8*rdx]
mov ecx, edx
sub ecx, eax
xor eax, eax
add ecx, edi
setne al
add eax, edx
ret
div_ceil_with_assume: # @div_ceil_with_assume
lea eax, [rdi + 6]
movabs rcx, 2635249153617166336
mul rcx
mov rax, rdx
ret
div_ceil_with_range: # @div_ceil_with_range
not edi
popcnt eax, edi
add al, 6
movzx eax, al
imul eax, eax, 147
shr eax, 10
ret
```
>From c95ada852c3bfdb4db2c571f7f2c50f3d58761cf Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Sat, 21 Mar 2026 16:45:08 -0400
Subject: [PATCH 1/5] add tests for div_ceil folding which currently fail
---
.../InstCombine/InstCombineAddSub.cpp | 43 ++++++
.../InstCombine/InstCombineInternal.h | 3 +
.../Transforms/InstCombine/add-divceil.ll | 132 ++++++++++++++++++
3 files changed, 178 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/add-divceil.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c781c6978b275..9865143ae1aa0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1523,6 +1523,46 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
return nullptr;
}
+// Fold the div_ceil idiom:
+// add(udiv(A, C), zext(icmp ne(urem(A, C), 0)))
+// -> udiv(add nuw(A, C - 1), C)
+// The zext of the icmp is just type-plumbing (i1 -> A's type); the fold
+// stays entirely in A's type. Valid when A + (C-1) is provably non-wrapping,
+// checked via ConstantRange (range attributes, assume, etc.).
+Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+ Value *A, *A2;
+ const APInt *C1, *C2;
+ CmpPredicate Pred;
+
+ // Bind A and A2 independently so m_c_Add handles both operand orderings.
+ auto UDivPat = m_OneUse(m_UDiv(m_Value(A), m_APInt(C1)));
+ auto URemPat = m_OneUse(m_URem(m_Value(A2), m_APInt(C2)));
+ auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+ auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
+
+ if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
+ A != A2 || *C1 != *C2 || !C1->ugt(1))
+ return nullptr;
+
+ // Require A + (C-1) to not overflow unsigned in A's type.
+ // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
+ // from known-zero high bits, so it loses range info for near-max values
+ // (e.g. an assume of "a < UINT_MAX-5" on a 32-bit value leaves no
+ // universally-zero bits and getMaxValue() returns UINT_MAX).
+ // ConstantRange tracks the full [lo, hi) interval and gives a tight max.
+ // Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
+ // computeConstantRange, so it won't pick up llvm.assume; call the full form.
+ ConstantRange CR = computeConstantRange(A, /*ForSigned=*/false,
+ /*UseInstrInfo=*/true, &AC, &I, &DT);
+ APInt UMax = CR.getUnsignedMax();
+ if (UMax.ugt(APInt::getMaxValue(UMax.getBitWidth()) - (*C1 - 1)))
+ return nullptr;
+
+ Value *CMinusOne = ConstantInt::get(A->getType(), *C1 - 1);
+ Value *NUWAdd = Builder.CreateAdd(A, CMinusOne, "", /*HasNUW=*/true);
+ return BinaryOperator::CreateUDiv(NUWAdd, ConstantInt::get(A->getType(), *C1));
+}
+
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Value *V = simplifyAddInst(I.getOperand(0), I.getOperand(1),
I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
@@ -1915,6 +1955,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
return Res;
+ if (Instruction *Res = foldDivCeil(I))
+ return Res;
+
// Re-enqueue users of the induction variable of add recurrence if we infer
// new nuw/nsw flags.
if (Changed) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 160f766b60973..f58fc5dcc876b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,6 +544,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
/// (Binop (cast C), (select C, T, F))
/// -> (select C, C0, C1)
Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
+ /// (add (zext (udiv A, C)), (zext (icmp ne (urem A, C), 0)))
+ /// -> (zext (udiv (add nuw A, C-1), C))
+ Instruction *foldDivCeil(BinaryOperator &I);
/// This tries to simplify binary operations by factorizing out common terms
/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").
diff --git a/llvm/test/Transforms/InstCombine/add-divceil.ll b/llvm/test/Transforms/InstCombine/add-divceil.ll
new file mode 100644
index 0000000000000..bb8c82fefff74
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/add-divceil.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; Fold the div_ceil idiom (narrow-add form):
+; add(udiv(A, C), zext i1(icmp ne(urem(A, C), 0)))
+; -> udiv(add nuw(A, C - 1), C)
+; when A + (C-1) is provably non-wrapping (via KnownBits / range info).
+
+declare void @use(i8)
+declare void @llvm.assume(i1)
+
+; Basic: i8 in [1, 32] -> bits 7,6,5 known zero, max = 63; 63+6 <= 255.
+define i8 @divceil_i8_bounded(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_bounded(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[A:%.*]], 6
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %a, 7
+ %r = urem i8 %a, 7
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
+
+; Commuted: zext(icmp) on the left of add.
+define i8 @divceil_i8_bounded_commuted(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_bounded_commuted(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[A:%.*]], 6
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %a, 7
+ %r = urem i8 %a, 7
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %round, %q
+ ret i8 %result
+}
+
+; With llvm.assume: A <= 100, so bit 7 known zero, max = 127; 127+6 <= 255.
+define i8 @divceil_i8_assume(i8 %a) {
+; CHECK-LABEL: @divceil_i8_assume(
+; CHECK-NEXT: [[OK:%.*]] = icmp ult i8 [[A:%.*]], 101
+; CHECK-NEXT: call void @llvm.assume(i1 [[OK]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i8 [[A]], 6
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i8 [[TMP1]], 7
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %ok = icmp ule i8 %a, 100
+ call void @llvm.assume(i1 %ok)
+ %q = udiv i8 %a, 7
+ %r = urem i8 %a, 7
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
+
+; With assume a < UINT_MAX-5 (i.e. a <= UINT_MAX-6): ConstantRange max =
+; UINT_MAX-6; no high bits are universally zero so KnownBits alone would fail,
+; but ConstantRange handles it. This is the form Rust emits for
+; assert_unchecked(a <= u32::MAX - 6) on a div_ceil(7) call.
+define i32 @divceil_i32_assume_near_max(i32 %a) {
+; CHECK-LABEL: @divceil_i32_assume_near_max(
+; CHECK-NEXT: [[OK:%.*]] = icmp ult i32 [[A:%.*]], -6
+; CHECK-NEXT: call void @llvm.assume(i1 [[OK]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[A]], 6
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i32 [[TMP1]], 7
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %ok = icmp ult i32 %a, -6
+ call void @llvm.assume(i1 %ok)
+ %q = udiv i32 %a, 7
+ %r = urem i32 %a, 7
+ %cond = icmp ne i32 %r, 0
+ %round = zext i1 %cond to i32
+ %result = add i32 %q, %round
+ ret i32 %result
+}
+
+; Negative: no range info -> KnownBits max = 255; 255+6 overflows.
+define i8 @divceil_i8_unbounded(i8 %a) {
+; CHECK-LABEL: @divceil_i8_unbounded(
+; CHECK-NEXT: [[Q:%.*]] = udiv i8 [[A:%.*]], 7
+; CHECK-NEXT: [[R:%.*]] = urem i8 [[A]], 7
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT: [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT: [[RESULT:%.*]] = add nuw nsw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %a, 7
+ %r = urem i8 %a, 7
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
+
+; Negative: udiv has multiple uses -> one-use check fails.
+define i8 @divceil_i8_udiv_multiuse(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_udiv_multiuse(
+; CHECK-NEXT: [[Q:%.*]] = udiv i8 [[A:%.*]], 7
+; CHECK-NEXT: [[R:%.*]] = urem i8 [[A]], 7
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT: [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT: [[RESULT:%.*]] = add nuw nsw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT: call void @use(i8 [[Q]])
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %a, 7
+ %r = urem i8 %a, 7
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ call void @use(i8 %q)
+ ret i8 %result
+}
+
+; Negative: divisor == 1 -> no-op (udiv and urem by 1 are trivial).
+define i8 @divceil_i8_div1(i8 range(i8 1, 33) %a) {
+; CHECK-LABEL: @divceil_i8_div1(
+; CHECK-NEXT: ret i8 [[A:%.*]]
+;
+ %q = udiv i8 %a, 1
+ %r = urem i8 %a, 1
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
>From 5406e664b12a57191b6973634ae502df75215f8b Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 17:02:39 -0400
Subject: [PATCH 2/5] allow the narrow div ceil fold, add(udiv(X, C), zext(icmp
ne(urem(X, C), 0)) -> udiv(add nuw(A, C - 1), C) to handle variable divisors
---
.../InstCombine/InstCombineAddSub.cpp | 44 +++++----
.../InstCombine/InstCombineInternal.h | 4 +-
.../Transforms/InstCombine/add-divceil.ll | 92 ++++++++++++++++++-
3 files changed, 114 insertions(+), 26 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 9865143ae1aa0..0295ef6272aa8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1524,43 +1524,47 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
}
// Fold the div_ceil idiom:
-// add(udiv(A, C), zext(icmp ne(urem(A, C), 0)))
-// -> udiv(add nuw(A, C - 1), C)
-// The zext of the icmp is just type-plumbing (i1 -> A's type); the fold
-// stays entirely in A's type. Valid when A + (C-1) is provably non-wrapping,
-// checked via ConstantRange (range attributes, assume, etc.).
+// add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+// -> udiv(add nuw(X, Y - 1), Y)
+// The zext of the icmp is just type-plumbing (i1 -> X's type); the fold
+// stays entirely in X's type. Valid when X + (Y-1) is provably non-wrapping,
+// checked via ConstantRange on both operands (range attributes, assume, etc.).
Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
- Value *A, *A2;
- const APInt *C1, *C2;
+ Value *X, *X2, *Y, *Y2;
CmpPredicate Pred;
- // Bind A and A2 independently so m_c_Add handles both operand orderings.
- auto UDivPat = m_OneUse(m_UDiv(m_Value(A), m_APInt(C1)));
- auto URemPat = m_OneUse(m_URem(m_Value(A2), m_APInt(C2)));
+ // Bind X and X2 independently so m_c_Add handles both operand orderings.
+ auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+ auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
- A != A2 || *C1 != *C2 || !C1->ugt(1))
+ X != X2 || Y != Y2)
return nullptr;
- // Require A + (C-1) to not overflow unsigned in A's type.
+ // Require X + (Y-1) to not overflow unsigned in X's type.
// Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
// from known-zero high bits, so it loses range info for near-max values
- // (e.g. an assume of "a < UINT_MAX-5" on a 32-bit value leaves no
+ // (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
// universally-zero bits and getMaxValue() returns UINT_MAX).
// ConstantRange tracks the full [lo, hi) interval and gives a tight max.
// Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
// computeConstantRange, so it won't pick up llvm.assume; call the full form.
- ConstantRange CR = computeConstantRange(A, /*ForSigned=*/false,
- /*UseInstrInfo=*/true, &AC, &I, &DT);
- APInt UMax = CR.getUnsignedMax();
- if (UMax.ugt(APInt::getMaxValue(UMax.getBitWidth()) - (*C1 - 1)))
+ ConstantRange CRX = computeConstantRange(X, /*ForSigned=*/false,
+ /*UseInstrInfo=*/true, &AC, &I, &DT);
+ ConstantRange CRY = computeConstantRange(Y, /*ForSigned=*/false,
+ /*UseInstrInfo=*/true, &AC, &I, &DT);
+ APInt MaxX = CRX.getUnsignedMax();
+ APInt MaxY = CRY.getUnsignedMax();
+ unsigned BitWidth = MaxX.getBitWidth();
+ // MaxX + (MaxY - 1) <= UINT_MAX <==> MaxX <= UINT_MAX - (MaxY - 1)
+ if (MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1)))
return nullptr;
- Value *CMinusOne = ConstantInt::get(A->getType(), *C1 - 1);
- Value *NUWAdd = Builder.CreateAdd(A, CMinusOne, "", /*HasNUW=*/true);
- return BinaryOperator::CreateUDiv(NUWAdd, ConstantInt::get(A->getType(), *C1));
+ Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+ Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+ return BinaryOperator::CreateUDiv(NUWAdd, Y);
}
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index f58fc5dcc876b..93bcb66369775 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,8 +544,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
/// (Binop (cast C), (select C, T, F))
/// -> (select C, C0, C1)
Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
- /// (add (zext (udiv A, C)), (zext (icmp ne (urem A, C), 0)))
- /// -> (zext (udiv (add nuw A, C-1), C))
+ /// (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
+ /// -> (udiv (add nuw X, Y-1), Y)
Instruction *foldDivCeil(BinaryOperator &I);
/// This tries to simplify binary operations by factorizing out common terms
diff --git a/llvm/test/Transforms/InstCombine/add-divceil.ll b/llvm/test/Transforms/InstCombine/add-divceil.ll
index bb8c82fefff74..9bdb3427f1efc 100644
--- a/llvm/test/Transforms/InstCombine/add-divceil.ll
+++ b/llvm/test/Transforms/InstCombine/add-divceil.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-; Fold the div_ceil idiom (narrow-add form):
-; add(udiv(A, C), zext i1(icmp ne(urem(A, C), 0)))
-; -> udiv(add nuw(A, C - 1), C)
-; when A + (C-1) is provably non-wrapping (via KnownBits / range info).
+; Fold the div_ceil idiom:
+; add(udiv(X, Y), zext i1(icmp ne(urem(X, Y), 0)))
+; -> udiv(add nuw(X, Y - 1), Y)
+; when X + (Y-1) is provably non-wrapping (via range info on both X and Y).
declare void @use(i8)
declare void @llvm.assume(i1)
@@ -130,3 +130,87 @@ define i8 @divceil_i8_div1(i8 range(i8 1, 33) %a) {
%result = add i8 %q, %round
ret i8 %result
}
+
+; Variable divisor: both X in [0,100] and Y in [1,10], so max X+(Y-1) = 109 <= 255.
+define i8 @divceil_i8_var_divisor(i8 range(i8 0, 101) %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor(
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw i8 [[Y:%.*]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i8 [[TMP2]], [[Y]]
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %x, %y
+ %r = urem i8 %x, %y
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
+
+; Variable divisor, commuted add.
+define i8 @divceil_i8_var_divisor_commuted(i8 range(i8 0, 101) %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_commuted(
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw i8 [[Y:%.*]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i8 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i8 [[TMP2]], [[Y]]
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %x, %y
+ %r = urem i8 %x, %y
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %round, %q
+ ret i8 %result
+}
+
+; Variable divisor with i32: X in [0, 100], Y in [2, 8], max X+(Y-1) = 107 <= UINT32_MAX.
+define i32 @divceil_i32_var_divisor(i32 range(i32 0, 101) %x, i32 range(i32 2, 9) %y) {
+; CHECK-LABEL: @divceil_i32_var_divisor(
+; CHECK-NEXT: [[TMP1:%.*]] = add nsw i32 [[Y:%.*]], -1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[X:%.*]], [[TMP1]]
+; CHECK-NEXT: [[RESULT:%.*]] = udiv i32 [[TMP2]], [[Y]]
+; CHECK-NEXT: ret i32 [[RESULT]]
+;
+ %q = udiv i32 %x, %y
+ %r = urem i32 %x, %y
+ %cond = icmp ne i32 %r, 0
+ %round = zext i1 %cond to i32
+ %result = add i32 %q, %round
+ ret i32 %result
+}
+
+; Negative: Y unbounded -> max Y = 255, max X+(Y-1) = 100+254 overflows i8.
+define i8 @divceil_i8_var_divisor_y_unbounded(i8 range(i8 0, 101) %x, i8 %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_y_unbounded(
+; CHECK-NEXT: [[Q:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = urem i8 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT: [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT: [[RESULT:%.*]] = add nuw i8 [[Q]], [[ROUND]]
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %x, %y
+ %r = urem i8 %x, %y
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
+
+; Negative: X unbounded -> max X+(Y-1) overflows even with bounded Y.
+define i8 @divceil_i8_var_divisor_x_unbounded(i8 %x, i8 range(i8 1, 11) %y) {
+; CHECK-LABEL: @divceil_i8_var_divisor_x_unbounded(
+; CHECK-NEXT: [[Q:%.*]] = udiv i8 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[R:%.*]] = urem i8 [[X]], [[Y]]
+; CHECK-NEXT: [[COND:%.*]] = icmp ne i8 [[R]], 0
+; CHECK-NEXT: [[ROUND:%.*]] = zext i1 [[COND]] to i8
+; CHECK-NEXT: [[RESULT:%.*]] = add i8 [[Q]], [[ROUND]]
+; CHECK-NEXT: ret i8 [[RESULT]]
+;
+ %q = udiv i8 %x, %y
+ %r = urem i8 %x, %y
+ %cond = icmp ne i8 %r, 0
+ %round = zext i1 %cond to i8
+ %result = add i8 %q, %round
+ ret i8 %result
+}
>From b089e7df8288ffe1aac6453afd5c8cccc301be53 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 20:47:28 -0400
Subject: [PATCH 3/5] add wide div ceil folding so range information properly
propagates even when the dividend is trunced to a narrower width
---
llvm/lib/Analysis/ValueTracking.cpp | 5 ++
.../InstCombine/InstCombineAddSub.cpp | 74 +++++++++++++------
2 files changed, 55 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 340a616f13e19..2460397cba138 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -10361,6 +10361,11 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
SI->getFalseValue(), ForSigned, UseInstrInfo, AC, CtxI, DT, Depth + 1);
CR = CRTrue.unionWith(CRFalse);
CR = CR.intersectWith(getRangeForSelectPattern(*SI, IIQ));
+ } else if (auto *TI = dyn_cast<TruncInst>(V)) {
+ ConstantRange SrcCR =
+ computeConstantRange(TI->getOperand(0), ForSigned, UseInstrInfo, AC,
+ CtxI, DT, Depth + 1);
+ CR = SrcCR.truncate(BitWidth);
} else if (isa<FPToUIInst>(V) || isa<FPToSIInst>(V)) {
APInt Lower = APInt(BitWidth, 0);
APInt Upper = APInt(BitWidth, 0);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 0295ef6272aa8..6f05261d335fa 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1524,26 +1524,16 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
}
// Fold the div_ceil idiom:
-// add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
-// -> udiv(add nuw(X, Y - 1), Y)
-// The zext of the icmp is just type-plumbing (i1 -> X's type); the fold
-// stays entirely in X's type. Valid when X + (Y-1) is provably non-wrapping,
-// checked via ConstantRange on both operands (range attributes, assume, etc.).
-Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
- Value *X, *X2, *Y, *Y2;
- CmpPredicate Pred;
-
- // Bind X and X2 independently so m_c_Add handles both operand orderings.
- auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
- auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
- auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
- auto ZExtPat = m_OneUse(m_ZExt(ICmpPat));
-
- if (!match(&I, m_c_Add(UDivPat, ZExtPat)) || Pred != ICmpInst::ICMP_NE ||
- X != X2 || Y != Y2)
- return nullptr;
-
- // Require X + (Y-1) to not overflow unsigned in X's type.
+// Wide form: add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+// -> udiv(add nuw(X, Y - 1), Y)
+// Narrow form: add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
+// -> zext(udiv(add nuw(X, Y - 1), Y))
+// In the narrow form X and Y operate in a type narrower than the add; the
+// result is zero-extended back to the add's type.
+// Valid when X + (Y-1) is provably non-wrapping in X's type, checked via
+// ConstantRange (range attributes, assume, etc.).
+static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
+ AssumptionCache &AC, DominatorTree &DT) {
// Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
// from known-zero high bits, so it loses range info for near-max values
// (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
@@ -1559,12 +1549,48 @@ Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
APInt MaxY = CRY.getUnsignedMax();
unsigned BitWidth = MaxX.getBitWidth();
// MaxX + (MaxY - 1) <= UINT_MAX <==> MaxX <= UINT_MAX - (MaxY - 1)
- if (MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1)))
+ return !MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1));
+}
+
+Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+ Value *X, *X2, *Y, *Y2;
+ CmpPredicate Pred;
+
+ // Wide form: udiv and urem are the same type as the add.
+ auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+ auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
+ auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+ auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
+
+ if (match(&I, m_c_Add(UDivPat, ZExtCmpPat)) && Pred == ICmpInst::ICMP_NE &&
+ X == X2 && Y == Y2 && checkDivCeilNUW(X, Y, I, AC, DT)) {
+ Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+ Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+ return BinaryOperator::CreateUDiv(NUWAdd, Y);
+ }
+
+ // Narrow form: udiv and urem are in a narrower type; both are zext'd before
+ // the add. add(zext(udiv(X,Y)), zext(ne(urem(X,Y),0)))
+ // -> zext(udiv(add nuw(X, Y-1), Y))
+ Value *NX, *NX2, *NY, *NY2;
+ CmpPredicate Pred2;
+ auto NUDivPat = m_OneUse(m_UDiv(m_Value(NX), m_Value(NY)));
+ auto NURemPat = m_OneUse(m_URem(m_Value(NX2), m_Value(NY2)));
+ auto NICmpPat = m_OneUse(m_ICmp(Pred2, NURemPat, m_Zero()));
+ auto ZExtDivPat = m_OneUse(m_ZExt(NUDivPat));
+ auto ZExtNarrowCmpPat = m_OneUse(m_ZExt(NICmpPat));
+
+ if (!match(&I, m_c_Add(ZExtDivPat, ZExtNarrowCmpPat)) ||
+ Pred2 != ICmpInst::ICMP_NE || NX != NX2 || NY != NY2)
+ return nullptr;
+
+ if (!checkDivCeilNUW(NX, NY, I, AC, DT))
return nullptr;
- Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
- Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
- return BinaryOperator::CreateUDiv(NUWAdd, Y);
+ Value *YMinusOne = Builder.CreateSub(NY, ConstantInt::get(NY->getType(), 1));
+ Value *NUWAdd = Builder.CreateAdd(NX, YMinusOne, "", /*HasNUW=*/true);
+ Value *Div = Builder.CreateUDiv(NUWAdd, NY);
+ return new ZExtInst(Div, I.getType());
}
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
>From 2c723332ce7e3079edcbd3ca4062baea904ae4ed Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Wed, 1 Apr 2026 20:49:27 -0400
Subject: [PATCH 4/5] fix failing test that now gets nuw because of trunc
propagation on sub
---
llvm/test/Transforms/InstCombine/fls.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/Transforms/InstCombine/fls.ll b/llvm/test/Transforms/InstCombine/fls.ll
index 68bc0a2fc8a1d..ea757268259f5 100644
--- a/llvm/test/Transforms/InstCombine/fls.ll
+++ b/llvm/test/Transforms/InstCombine/fls.ll
@@ -33,7 +33,7 @@ define i32 @flsnotconst(i64 %z) {
; CHECK-LABEL: @flsnotconst(
; CHECK-NEXT: [[CTLZ:%.*]] = call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[Z:%.*]], i1 false)
; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[CTLZ]] to i32
-; CHECK-NEXT: [[GOO:%.*]] = sub nsw i32 64, [[TMP1]]
+; CHECK-NEXT: [[GOO:%.*]] = sub nuw nsw i32 64, [[TMP1]]
; CHECK-NEXT: ret i32 [[GOO]]
;
%goo = call i32 @flsl(i64 %z)
>From 31ab576383e62f8f9cb2f52f1735bbfc2b228616 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Thu, 2 Apr 2026 08:57:17 -0400
Subject: [PATCH 5/5] split folds out
---
.../InstCombine/InstCombineAddSub.cpp | 88 ++++++++++---------
.../InstCombine/InstCombineInternal.h | 7 +-
2 files changed, 51 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 6f05261d335fa..fc39de2a1c0c6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1523,24 +1523,17 @@ static Instruction *foldBoxMultiply(BinaryOperator &I) {
return nullptr;
}
-// Fold the div_ceil idiom:
-// Wide form: add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
-// -> udiv(add nuw(X, Y - 1), Y)
-// Narrow form: add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
-// -> zext(udiv(add nuw(X, Y - 1), Y))
-// In the narrow form X and Y operate in a type narrower than the add; the
-// result is zero-extended back to the add's type.
-// Valid when X + (Y-1) is provably non-wrapping in X's type, checked via
+// Return true if X + (Y-1) is provably non-wrapping in X's type, using
// ConstantRange (range attributes, assume, etc.).
+// Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
+// from known-zero high bits, so it loses range info for near-max values
+// (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
+// universally-zero bits and getMaxValue() returns UINT_MAX).
+// ConstantRange tracks the full [lo, hi) interval and gives a tight max.
+// Note: computeConstantRangeIncludingKnownBits does not pass AC/DT down, so
+// it misses llvm.assume facts; call computeConstantRange with AC/DT directly.
static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
AssumptionCache &AC, DominatorTree &DT) {
- // Use ConstantRange rather than KnownBits: KnownBits can only derive bounds
- // from known-zero high bits, so it loses range info for near-max values
- // (e.g. an assume of "x < UINT_MAX-5" on a 32-bit value leaves no
- // universally-zero bits and getMaxValue() returns UINT_MAX).
- // ConstantRange tracks the full [lo, hi) interval and gives a tight max.
- // Note: computeConstantRangeIncludingKnownBits does not forward AC/DT to
- // computeConstantRange, so it won't pick up llvm.assume; call the full form.
ConstantRange CRX = computeConstantRange(X, /*ForSigned=*/false,
/*UseInstrInfo=*/true, &AC, &I, &DT);
ConstantRange CRY = computeConstantRange(Y, /*ForSigned=*/false,
@@ -1552,44 +1545,52 @@ static bool checkDivCeilNUW(Value *X, Value *Y, BinaryOperator &I,
return !MaxX.ugt(APInt::getMaxValue(BitWidth) - (MaxY - 1));
}
-Instruction *InstCombinerImpl::foldDivCeil(BinaryOperator &I) {
+// Fold the wide form of the div_ceil idiom:
+// add(udiv(X, Y), zext(icmp ne(urem(X, Y), 0)))
+// -> udiv(add nuw(X, Y - 1), Y)
+// udiv and urem have the same type as the add.
+// Valid when X + (Y-1) is provably non-wrapping in X's type.
+Instruction *InstCombinerImpl::foldWideDivCeil(BinaryOperator &I) {
Value *X, *X2, *Y, *Y2;
CmpPredicate Pred;
- // Wide form: udiv and urem are the same type as the add.
auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
- if (match(&I, m_c_Add(UDivPat, ZExtCmpPat)) && Pred == ICmpInst::ICMP_NE &&
- X == X2 && Y == Y2 && checkDivCeilNUW(X, Y, I, AC, DT)) {
- Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
- Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
- return BinaryOperator::CreateUDiv(NUWAdd, Y);
- }
-
- // Narrow form: udiv and urem are in a narrower type; both are zext'd before
- // the add. add(zext(udiv(X,Y)), zext(ne(urem(X,Y),0)))
- // -> zext(udiv(add nuw(X, Y-1), Y))
- Value *NX, *NX2, *NY, *NY2;
- CmpPredicate Pred2;
- auto NUDivPat = m_OneUse(m_UDiv(m_Value(NX), m_Value(NY)));
- auto NURemPat = m_OneUse(m_URem(m_Value(NX2), m_Value(NY2)));
- auto NICmpPat = m_OneUse(m_ICmp(Pred2, NURemPat, m_Zero()));
- auto ZExtDivPat = m_OneUse(m_ZExt(NUDivPat));
- auto ZExtNarrowCmpPat = m_OneUse(m_ZExt(NICmpPat));
-
- if (!match(&I, m_c_Add(ZExtDivPat, ZExtNarrowCmpPat)) ||
- Pred2 != ICmpInst::ICMP_NE || NX != NX2 || NY != NY2)
+ if (!match(&I, m_c_Add(UDivPat, ZExtCmpPat)) || Pred != ICmpInst::ICMP_NE ||
+ X != X2 || Y != Y2 || !checkDivCeilNUW(X, Y, I, AC, DT))
return nullptr;
- if (!checkDivCeilNUW(NX, NY, I, AC, DT))
+ Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+ Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+ return BinaryOperator::CreateUDiv(NUWAdd, Y);
+}
+
+// Fold the narrow form of the div_ceil idiom:
+// add(zext(udiv(X, Y)), zext(icmp ne(urem(X, Y), 0)))
+// -> zext(udiv(add nuw(X, Y - 1), Y))
+// udiv and urem operate in a type narrower than the add; the result is
+// zero-extended back to the add's type.
+// Valid when X + (Y-1) is provably non-wrapping in X's type.
+Instruction *InstCombinerImpl::foldNarrowDivCeil(BinaryOperator &I) {
+ Value *X, *X2, *Y, *Y2;
+ CmpPredicate Pred;
+
+ auto UDivPat = m_OneUse(m_UDiv(m_Value(X), m_Value(Y)));
+ auto URemPat = m_OneUse(m_URem(m_Value(X2), m_Value(Y2)));
+ auto ICmpPat = m_OneUse(m_ICmp(Pred, URemPat, m_Zero()));
+ auto ZExtDivPat = m_OneUse(m_ZExt(UDivPat));
+ auto ZExtCmpPat = m_OneUse(m_ZExt(ICmpPat));
+
+ if (!match(&I, m_c_Add(ZExtDivPat, ZExtCmpPat)) || Pred != ICmpInst::ICMP_NE ||
+ X != X2 || Y != Y2 || !checkDivCeilNUW(X, Y, I, AC, DT))
return nullptr;
- Value *YMinusOne = Builder.CreateSub(NY, ConstantInt::get(NY->getType(), 1));
- Value *NUWAdd = Builder.CreateAdd(NX, YMinusOne, "", /*HasNUW=*/true);
- Value *Div = Builder.CreateUDiv(NUWAdd, NY);
+ Value *YMinusOne = Builder.CreateSub(Y, ConstantInt::get(Y->getType(), 1));
+ Value *NUWAdd = Builder.CreateAdd(X, YMinusOne, "", /*HasNUW=*/true);
+ Value *Div = Builder.CreateUDiv(NUWAdd, Y);
return new ZExtInst(Div, I.getType());
}
@@ -1985,7 +1986,10 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *Res = foldBinOpOfSelectAndCastOfSelectCondition(I))
return Res;
- if (Instruction *Res = foldDivCeil(I))
+ if (Instruction *Res = foldWideDivCeil(I))
+ return Res;
+
+ if (Instruction *Res = foldNarrowDivCeil(I))
return Res;
// Re-enqueue users of the induction variable of add recurrence if we infer
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 93bcb66369775..dfebf0e6dcb67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -544,9 +544,12 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
/// (Binop (cast C), (select C, T, F))
/// -> (select C, C0, C1)
Instruction *foldBinOpOfSelectAndCastOfSelectCondition(BinaryOperator &I);
- /// (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
+ /// Wide form: (add (udiv X, Y), (zext (icmp ne (urem X, Y), 0)))
/// -> (udiv (add nuw X, Y-1), Y)
- Instruction *foldDivCeil(BinaryOperator &I);
+ Instruction *foldWideDivCeil(BinaryOperator &I);
+ /// Narrow form: (add (zext (udiv X, Y)), (zext (icmp ne (urem X, Y), 0)))
+ /// -> (zext (udiv (add nuw X, Y-1), Y))
+ Instruction *foldNarrowDivCeil(BinaryOperator &I);
/// This tries to simplify binary operations by factorizing out common terms
/// (e. g. "(A*B)+(A*C)" -> "A*(B+C)").
More information about the llvm-commits
mailing list