[llvm] InstCombine: improve optimizations for ceiling division with no overflow (PR #142869)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Jun 7 13:58:11 PDT 2025
https://github.com/gaynor-anthropic updated https://github.com/llvm/llvm-project/pull/142869
From ee8f7a53db2bc56e85d7e63a31228cf7e7453835 Mon Sep 17 00:00:00 2001
From: Alex Gaynor <gaynor at anthropic.com>
Date: Wed, 4 Jun 2025 18:34:08 -0400
Subject: [PATCH 1/5] InstCombine: improve optimizations for ceiling division
with no overflow
fixes #142497
---
.../InstCombine/InstCombineAddSub.cpp | 44 +++++
llvm/test/Transforms/InstCombine/add.ll | 157 ++++++++++++++++++
2 files changed, 201 insertions(+)
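(Note for reviewers, not part of the patch: a minimal standalone illustration of the equivalence this combine exploits, written in plain C++ for a divisor of 8. Writing x = 8*q + r with 0 <= r < 8, the split form computes q + (r != 0), which is ceil(x/8); the add form computes (x + 7) >> 3, which yields the same value whenever x + 7 does not wrap.)

    #include <cassert>
    #include <cstdint>

    // Quotient plus "round up if there is a remainder": (x >> 3) + ((x & 7) != 0).
    static uint32_t ceil_div_split(uint32_t x) {
      return (x >> 3) + ((x & 7u) != 0u);
    }

    // The cheaper form the combine produces: (x + 7) >> 3.
    // Only valid when x + 7 does not overflow.
    static uint32_t ceil_div_add(uint32_t x) {
      return (x + 7u) >> 3;
    }

    int main() {
      for (uint32_t x = 0; x < 1000; ++x)
        assert(ceil_div_split(x) == ceil_div_add(x));
      return 0;
    }
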
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index a9ac5ff9b9c89..16ebd7bceff63 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1787,6 +1787,50 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *Ashr = foldAddToAshr(I))
return Ashr;
+ // Ceiling division by power-of-2:
+ // (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
+ // This is valid when adding (N-1) to X doesn't overflow.
+ {
+ Value *X = nullptr, *Cmp = nullptr;
+ const APInt *ShiftAmt = nullptr, *Mask = nullptr;
+ CmpPredicate Pred;
+
+ // Match: (X >> C) + zext((X & Mask) != 0)
+ // or: zext((X & Mask) != 0) + (X >> C)
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+
+ // Try matching with shift on left, zext on right
+ bool Matched = false;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
+ match(Op1, m_ZExt(m_Value(Cmp)))) {
+ Matched = match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_APInt(Mask)),
+ m_ZeroInt()));
+ } else if (match(Op1, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
+ match(Op0, m_ZExt(m_Value(Cmp)))) {
+ Matched = match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_APInt(Mask)),
+ m_ZeroInt()));
+ }
+
+ if (Matched &&
+ Pred == ICmpInst::ICMP_NE &&
+ ShiftAmt && ShiftAmt->uge(1) && ShiftAmt->ult(BitWidth) &&
+ Mask && *Mask == (APInt(BitWidth, 1) << *ShiftAmt) - 1) {
+
+ // Check if X + Mask doesn't overflow
+ Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
+ bool WillNotOverflowUnsigned = willNotOverflowUnsignedAdd(X, MaskC, I);
+
+ if (WillNotOverflowUnsigned) {
+ // (X + Mask) >> ShiftAmt
+ bool WillNotOverflowSigned = willNotOverflowSignedAdd(X, MaskC, I);
+ Value *Add = Builder.CreateAdd(X, MaskC, "", WillNotOverflowUnsigned,
+ WillNotOverflowSigned);
+ return BinaryOperator::CreateLShr(Add, ConstantInt::get(X->getType(), *ShiftAmt));
+ }
+ }
+ }
+
// (~X) + (~Y) --> -2 - (X + Y)
{
// To ensure we can save instructions we need to ensure that we consume both
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 495f99824652d..d364082eab317 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -4273,4 +4273,161 @@ define i32 @fold_zext_nneg_add_const_fail2(i8 %x) {
}
declare void @llvm.assume(i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+
+; Ceiling division by power-of-2: (x >> log2(N)) + ((x & (N-1)) != 0) -> (x + (N-1)) >> log2(N)
+; This is only valid when x + (N-1) doesn't overflow
+
+; Test with known range that prevents overflow
+define noundef range(i32 0, 100) i32 @ceil_div_by_8_known_range(i32 noundef range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_by_8_known_range(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
+
+; Test with the exact IR from the original testcase
+define noundef range(i32 0, 6) i32 @ceil_div_from_clz(i32 noundef %v) {
+; CHECK-LABEL: @ceil_div_from_clz(
+; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = sub nuw nsw i32 39, [[CTLZ]]
+; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %ctlz = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 %v, i1 false)
+ %sub = sub nuw nsw i32 32, %ctlz
+ %shr = lshr i32 %sub, 3
+ %and = and i32 %sub, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add nuw nsw i32 %shr, %ext
+ ret i32 %r
+}
+
+; Vector version with known range
+define <2 x i32> @ceil_div_by_8_vec_range(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_8_vec_range(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <2 x i32> [[X:%.*]], splat (i32 7)
+; CHECK-NEXT: [[R:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3)
+; CHECK-NEXT: ret <2 x i32> [[R]]
+;
+ %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+ %and = and <2 x i32> %x, <i32 7, i32 7>
+ %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+ %ext = zext <2 x i1> %cmp to <2 x i32>
+ %r = add <2 x i32> %shr, %ext
+ ret <2 x i32> %r
+}
+
+; Ceiling division by 16 with known range
+define i16 @ceil_div_by_16_i16(i16 range(i16 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_by_16_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[X:%.*]], 15
+; CHECK-NEXT: [[R:%.*]] = lshr i16 [[TMP1]], 4
+; CHECK-NEXT: ret i16 [[R]]
+;
+ %shr = lshr i16 %x, 4
+ %and = and i16 %x, 15
+ %cmp = icmp ne i16 %and, 0
+ %ext = zext i1 %cmp to i16
+ %r = add i16 %shr, %ext
+ ret i16 %r
+}
+
+; Negative test: no overflow guarantee - should NOT optimize
+define i32 @ceil_div_by_8_no_overflow_info(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_no_overflow_info(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
+
+; Negative test: nuw on final add doesn't help
+define i32 @ceil_div_by_8_only_nuw_on_add(i32 %x) {
+; CHECK-LABEL: @ceil_div_by_8_only_nuw_on_add(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add nuw i32 %shr, %ext ; nuw here doesn't prove x+7 won't overflow
+ ret i32 %r
+}
+
+; Negative test: wrong mask
+define i32 @ceil_div_wrong_mask(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_mask(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 6
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 6 ; Wrong mask: should be 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
+
+; Negative test: wrong shift amount
+define i32 @ceil_div_wrong_shift(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_shift(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 4
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 4 ; Shift by 4, but mask is 7 (should be 15)
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
+
+; Negative test: wrong comparison
+define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_wrong_cmp(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 7
+ %cmp = icmp eq i32 %and, 0 ; Wrong: should be ne
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
declare void @fake_func(i32)
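(Aside, not part of the patch: the overflow requirement exercised by the negative tests above is not hypothetical. A short check, assuming ordinary 32-bit unsigned wraparound, shows where the two forms diverge:)

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t x = UINT32_MAX;                      // x + 7 wraps around
      uint32_t split = (x >> 3) + ((x & 7u) != 0u); // 0x1FFFFFFF + 1 = 0x20000000
      uint32_t added = (x + 7u) >> 3;               // wraps to 6, then >> 3 = 0
      std::printf("%u vs %u\n", (unsigned)split, (unsigned)added);
      return 0;                                     // prints "536870912 vs 0"
    }
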
From 2b55fe227d63472a27f6572fc26695e714ae72f4 Mon Sep 17 00:00:00 2001
From: Alex Gaynor <gaynor at anthropic.com>
Date: Thu, 5 Jun 2025 20:49:07 -0400
Subject: [PATCH 2/5] review feedback: make use of m_c_Add and m_LowBitMask
helpers
---
.../InstCombine/InstCombineAddSub.cpp | 25 ++++++-------------
1 file changed, 7 insertions(+), 18 deletions(-)
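(Note on the new popcount check, my reading rather than anything stated in the patch: m_LowBitMask only matches constants of the form 2^k - 1, so once it has matched, comparing Mask->popcount() against the shift amount is equivalent to the earlier explicit *Mask == (1 << ShiftAmt) - 1 test. A tiny standalone check of that arithmetic, using plain integers and C++20 std::popcount rather than APInt:)

    #include <bit>
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t k = 1; k < 32; ++k) {
        uint32_t mask = (uint32_t(1) << k) - 1; // a "low-bit mask": 2^k - 1
        // popcount of a low-bit mask recovers k, so popcount(mask) == shift
        // amount is the same condition as mask == (1 << shift) - 1.
        assert(std::popcount(mask) == int(k));
      }
      return 0;
    }
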
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 16ebd7bceff63..a2f89708009f7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1791,31 +1791,20 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
// This is valid when adding (N-1) to X doesn't overflow.
{
- Value *X = nullptr, *Cmp = nullptr;
+ Value *X = nullptr, *Cmp = nullptr, *Shift = nullptr;
const APInt *ShiftAmt = nullptr, *Mask = nullptr;
CmpPredicate Pred;
// Match: (X >> C) + zext((X & Mask) != 0)
// or: zext((X & Mask) != 0) + (X >> C)
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
-
- // Try matching with shift on left, zext on right
- bool Matched = false;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
- match(Op1, m_ZExt(m_Value(Cmp)))) {
- Matched = match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_APInt(Mask)),
- m_ZeroInt()));
- } else if (match(Op1, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
- match(Op0, m_ZExt(m_Value(Cmp)))) {
- Matched = match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_APInt(Mask)),
- m_ZeroInt()));
- }
-
- if (Matched &&
+ if (match(&I, m_c_Add(m_Value(Shift), m_ZExt(m_Value(Cmp)))) &&
+ match(Shift, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
+ Shift->hasOneUse() &&
+ match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_LowBitMask(Mask)),
+ m_ZeroInt())) &&
Pred == ICmpInst::ICMP_NE &&
ShiftAmt && ShiftAmt->uge(1) && ShiftAmt->ult(BitWidth) &&
- Mask && *Mask == (APInt(BitWidth, 1) << *ShiftAmt) - 1) {
+ Mask && Mask->popcount() == *ShiftAmt) {
// Check if X + Mask doesn't overflow
Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
From 1df936da385fa04b024401937fb45b30c6c32755 Mon Sep 17 00:00:00 2001
From: gaynor-anthropic <gaynor at anthropic.com>
Date: Sat, 7 Jun 2025 13:47:17 -0700
Subject: [PATCH 3/5] code review: apply suggestions
Co-authored-by: Yingwei Zheng <dtcxzyw at qq.com>
---
.../Transforms/InstCombine/InstCombineAddSub.cpp | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index a2f89708009f7..767acc3d3019e 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1797,14 +1797,8 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// Match: (X >> C) + zext((X & Mask) != 0)
// or: zext((X & Mask) != 0) + (X >> C)
- if (match(&I, m_c_Add(m_Value(Shift), m_ZExt(m_Value(Cmp)))) &&
- match(Shift, m_LShr(m_Value(X), m_APInt(ShiftAmt))) &&
- Shift->hasOneUse() &&
- match(Cmp, m_ICmp(Pred, m_And(m_Specific(X), m_LowBitMask(Mask)),
- m_ZeroInt())) &&
- Pred == ICmpInst::ICMP_NE &&
- ShiftAmt && ShiftAmt->uge(1) && ShiftAmt->ult(BitWidth) &&
- Mask && Mask->popcount() == *ShiftAmt) {
+ if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))), m_ZExt(m_SpecificICmp(ICmpInst::ICMP_NE, m_And(m_Deferred(X), m_LowBitMask(Mask)),
+ m_ZeroInt())))) && Mask->popcount() == *ShiftAmt) {
// Check if X + Mask doesn't overflow
Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
@@ -1812,9 +1806,7 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (WillNotOverflowUnsigned) {
// (X + Mask) >> ShiftAmt
- bool WillNotOverflowSigned = willNotOverflowSignedAdd(X, MaskC, I);
- Value *Add = Builder.CreateAdd(X, MaskC, "", WillNotOverflowUnsigned,
- WillNotOverflowSigned);
+ Value *Add = Builder.CreateNUWAdd(X, MaskC);
return BinaryOperator::CreateLShr(Add, ConstantInt::get(X->getType(), *ShiftAmt));
}
}
From 339675ff867b313d5da9b179893e62a8e08b816e Mon Sep 17 00:00:00 2001
From: Alex Gaynor <gaynor at anthropic.com>
Date: Sat, 7 Jun 2025 16:49:39 -0400
Subject: [PATCH 4/5] clang-format
---
.../Transforms/InstCombine/InstCombineAddSub.cpp | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 767acc3d3019e..12994e280f7fb 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1791,14 +1791,18 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
// (X >> log2(N)) + zext(X & (N-1) != 0) --> (X + (N-1)) >> log2(N)
// This is valid when adding (N-1) to X doesn't overflow.
{
- Value *X = nullptr, *Cmp = nullptr, *Shift = nullptr;
+ Value *X = nullptr;
const APInt *ShiftAmt = nullptr, *Mask = nullptr;
CmpPredicate Pred;
// Match: (X >> C) + zext((X & Mask) != 0)
// or: zext((X & Mask) != 0) + (X >> C)
- if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))), m_ZExt(m_SpecificICmp(ICmpInst::ICMP_NE, m_And(m_Deferred(X), m_LowBitMask(Mask)),
- m_ZeroInt())))) && Mask->popcount() == *ShiftAmt) {
+ if (match(&I, m_c_Add(m_OneUse(m_LShr(m_Value(X), m_APInt(ShiftAmt))),
+ m_ZExt(m_SpecificICmp(
+ ICmpInst::ICMP_NE,
+ m_And(m_Deferred(X), m_LowBitMask(Mask)),
+ m_ZeroInt())))) &&
+ Mask->popcount() == *ShiftAmt) {
// Check if X + Mask doesn't overflow
Constant *MaskC = ConstantInt::get(X->getType(), *Mask);
@@ -1807,7 +1811,8 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (WillNotOverflowUnsigned) {
// (X + Mask) >> ShiftAmt
Value *Add = Builder.CreateNUWAdd(X, MaskC);
- return BinaryOperator::CreateLShr(Add, ConstantInt::get(X->getType(), *ShiftAmt));
+ return BinaryOperator::CreateLShr(
+ Add, ConstantInt::get(X->getType(), *ShiftAmt));
}
}
}
From 74fc5e1aed804e783fd20d86e80cd22ade7dde13 Mon Sep 17 00:00:00 2001
From: Alex Gaynor <gaynor at anthropic.com>
Date: Sat, 7 Jun 2025 16:58:00 -0400
Subject: [PATCH 5/5] add additional test cases
---
llvm/test/Transforms/InstCombine/add.ll | 84 +++++++++++++++++++++++++
1 file changed, 84 insertions(+)
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index d364082eab317..74c022b07a9a3 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -4430,4 +4430,88 @@ define i32 @ceil_div_wrong_cmp(i32 range(i32 0, 100) %x) {
%r = add i32 %shr, %ext
ret i32 %r
}
+
+; Multi-use test: all intermediate values have uses
+define i32 @ceil_div_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_multi_use(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: call void @use_i32(i32 [[AND]])
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ call void @use_i32(i32 %shr)
+ %and = and i32 %x, 7
+ call void @use_i32(i32 %and)
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ call void @use_i32(i32 %ext)
+ %r = add i32 %shr, %ext
+ ret i32 %r
+}
+
+; Commuted test: add operands are swapped
+define i32 @ceil_div_commuted(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted(
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[X:%.*]], 7
+; CHECK-NEXT: [[R:%.*]] = lshr i32 [[TMP1]], 3
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ %r = add i32 %ext, %shr ; Operands swapped
+ ret i32 %r
+}
+
+; Commuted with multi-use
+define i32 @ceil_div_commuted_multi_use(i32 range(i32 0, 100) %x) {
+; CHECK-LABEL: @ceil_div_commuted_multi_use(
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 3
+; CHECK-NEXT: call void @use_i32(i32 [[SHR]])
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[X]], 7
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: call void @use_i32(i32 [[EXT]])
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SHR]], [[EXT]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %shr = lshr i32 %x, 3
+ call void @use_i32(i32 %shr)
+ %and = and i32 %x, 7
+ %cmp = icmp ne i32 %and, 0
+ %ext = zext i1 %cmp to i32
+ call void @use_i32(i32 %ext)
+ %r = add i32 %ext, %shr ; Operands swapped
+ ret i32 %r
+}
+
+; Multi-use with vector type
+define <2 x i32> @ceil_div_vec_multi_use(<2 x i32> range(i32 0, 1000) %x) {
+; CHECK-LABEL: @ceil_div_vec_multi_use(
+; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], splat (i32 3)
+; CHECK-NEXT: call void @use_vec(<2 x i32> [[SHR]])
+; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[X]], splat (i32 7)
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> [[AND]], zeroinitializer
+; CHECK-NEXT: [[EXT:%.*]] = zext <2 x i1> [[CMP]] to <2 x i32>
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw <2 x i32> [[SHR]], [[EXT]]
+; CHECK-NEXT: ret <2 x i32> [[R]]
+;
+ %shr = lshr <2 x i32> %x, <i32 3, i32 3>
+ call void @use_vec(<2 x i32> %shr)
+ %and = and <2 x i32> %x, <i32 7, i32 7>
+ %cmp = icmp ne <2 x i32> %and, <i32 0, i32 0>
+ %ext = zext <2 x i1> %cmp to <2 x i32>
+ %r = add <2 x i32> %shr, %ext
+ ret <2 x i32> %r
+}
+
+declare void @use_i32(i32)
+declare void @use_vec(<2 x i32>)
declare void @fake_func(i32)