[llvm] [CGP] Reconstruct borrow chain from icmp pattern for subtract-with-carry (PR #189018)

Paweł Bylica via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 27 07:49:33 PDT 2026


https://github.com/chfast created https://github.com/llvm/llvm-project/pull/189018

InstCombine simplifies chained usub.with.overflow intrinsics (as produced by multi-precision subtraction, e.g. Clang's __builtin_subcll) into an icmp pattern:

  carry_out = or(icmp ult(X, Y), and(icmp eq(X, Y), carry_in))

This is algebraically correct but prevents the backend from generating efficient subtract-with-borrow instructions (x86 sbb, AArch64 sbcs).
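
For illustration, the borrow-out of a 2x64 subtraction reaches CodeGenPrepare in this shape (taken from the subcarry_ult_2x64 test added below; the comments are mine):

  %b0 = icmp ult i64 %x0, %y0  ; borrow out of the low limb
  %b1 = icmp ult i64 %x1, %y1
  %e1 = icmp eq i64 %x1, %y1
  %bp = and i1 %b0, %e1  ; low borrow propagates when the high limbs are equal
  %br = or i1 %b1, %bp   ; borrow out of the high limb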

Add combineToUSubWithCarry() to CodeGenPrepare; it matches this pattern and reconstructs the chained usub.with.overflow form. The existing DAG combiner (combineCarryDiamond) then combines these into USUBO_CARRY.
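
For the IR above, the rewrite emits roughly the following (a sketch of the builder sequence in the patch; value names are illustrative):

  %carry.ext = zext i1 %b0 to i64
  %s1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %x1, i64 %y1)
  %diff = extractvalue { i64, i1 } %s1, 0
  %borrow1 = extractvalue { i64, i1 } %s1, 1
  %s2 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %diff, i64 %carry.ext)
  %borrow2 = extractvalue { i64, i1 } %s2, 1
  %carry.out = or i1 %borrow1, %borrow2  ; replaces %br

combineCarryDiamond then folds the two usub.with.overflow calls into a single USUBO_CARRY node, which x86 lowers to sbb (see the updated subcarry.ll test below).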

The transformation is gated on target support for ISD::USUBO_CARRY.

Fixes #106118.

>From ad750a31cbbda7a29438dab7408eecede34b73bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pawe=C5=82=20Bylica?= <pawel at hepcolgum.band>
Date: Fri, 27 Mar 2026 14:11:49 +0100
Subject: [PATCH] [CGP] Reconstruct borrow chain from icmp pattern for
 subtract-with-carry

InstCombine simplifies chained usub.with.overflow intrinsics (as
produced by multi-precision subtraction, e.g. Clang's __builtin_subcll)
into an icmp pattern:

  carry_out = or(icmp ult(X, Y), and(icmp eq(X, Y), carry_in))

This is algebraically correct but prevents the backend from generating
efficient subtract-with-borrow instructions (x86 sbb, AArch64 sbcs).

Add combineToUSubWithCarry() to CodeGenPrepare; it matches this pattern
and reconstructs the chained usub.with.overflow form. The existing DAG
combiner (combineCarryDiamond) then combines these into USUBO_CARRY.

The transformation is gated on target support for ISD::USUBO_CARRY.

Fixes #106118.
---
 llvm/lib/CodeGen/CodeGenPrepare.cpp    | 67 +++++++++++++++++++++
 llvm/test/CodeGen/AArch64/cgp-usubo.ll | 83 ++++++++++++++++++++++++--
 llvm/test/CodeGen/X86/subcarry.ll      |  9 +--
 3 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 56ee6e8b43304..64408ed3b9acb 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -474,6 +474,7 @@ class CodeGenPrepare {
   bool optimizeURem(Instruction *Rem);
   bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
   bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+  bool combineToUSubWithCarry(BinaryOperator *OrI);
   bool unfoldPowerOf2Test(CmpInst *Cmp);
   void verifyBFIUpdates(Function &F);
   bool _run(Function &F);
@@ -1762,6 +1763,68 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
   return true;
 }
 
+/// FIXME: The name is similar to combineToUSubWithOverflow. Consider renaming
+/// one or both to better distinguish them.
+///
+/// Reconstruct a subtract-with-borrow chain from its canonicalized icmp form.
+///
+/// InstCombine simplifies chained usub.with.overflow intrinsics (used for
+/// multi-precision subtraction) into icmp patterns:
+///   carry_out = or(icmp ult(A, B), and(icmp eq(A, B), carry_in))
+///
+/// This is algebraically correct but prevents the backend from generating
+/// efficient subtract-with-borrow instructions (e.g. x86 sbb, aarch64 sbcs).
+///
+/// This function matches the pattern and reconstructs the chained
+/// usub.with.overflow form that the DAG combiner can lower to USUBO_CARRY.
+bool CodeGenPrepare::combineToUSubWithCarry(BinaryOperator *OrI) {
+  if (!OrI->getType()->isIntOrIntVectorTy(1))
+    return false;
+
+  assert(OrI->getOpcode() == Instruction::Or && "Expected or instruction");
+
+  // Match: or(icmp ult(A, B), and(icmp eq(A, B), carry_in)),
+  // allowing commuted operands of the 'and' and of the 'icmp eq'.
+  Value *A, *B, *CarryIn;
+  if (!match(
+          OrI,
+          m_c_BinOp(
+              m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(A), m_Value(B)),
+              m_c_And(m_CombineOr(m_SpecificICmp(ICmpInst::ICMP_EQ,
+                                                 m_Deferred(A), m_Deferred(B)),
+                                  m_SpecificICmp(ICmpInst::ICMP_EQ,
+                                                 m_Deferred(B), m_Deferred(A))),
+                      m_Value(CarryIn)))))
+    return false;
+
+  // Check that the target supports subtract-with-borrow for this type.
+  Type *OpTy = A->getType();
+  EVT VT = TLI->getValueType(*DL, OpTy);
+  if (!TLI->isOperationLegalOrCustom(ISD::USUBO_CARRY, VT))
+    return false;
+
+  // Build the replacement:
+  //   %carry_ext = zext i1 %carry_in to iN
+  //   (%diff, %borrow1) = usubo(A, B)
+  //   (_, %borrow2) = usubo(%diff, %carry_ext)
+  //   %result = or %borrow1, %borrow2
+  IRBuilder<> Builder(OrI);
+
+  Value *CarryExt = Builder.CreateZExt(CarryIn, OpTy, "carry.ext");
+  Value *Sub1 =
+      Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, A, B);
+  Value *Diff = Builder.CreateExtractValue(Sub1, 0, "diff");
+  Value *Borrow1 = Builder.CreateExtractValue(Sub1, 1, "borrow");
+  Value *Sub2 = Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow,
+                                              Diff, CarryExt);
+  Value *Borrow2 = Builder.CreateExtractValue(Sub2, 1, "borrow");
+  Value *Result = Builder.CreateOr(Borrow1, Borrow2, "carry.out");
+
+  replaceAllUsesWith(OrI, Result, FreshBBs, IsHugeFunc);
+  OrI->eraseFromParent();
+  return true;
+}
+
 // Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
 // The same transformation exists in DAG combiner, but we repeat it here because
 // DAG builder can break the pattern by moving icmp into a successor block.
@@ -8979,6 +9042,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
       sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
     return true;
 
+  if (BinOp && BinOp->getOpcode() == Instruction::Or &&
+      combineToUSubWithCarry(BinOp))
+    return true;
+
   // TODO: Move this into the switch on opcode - it handles shifts already.
   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                 BinOp->getOpcode() == Instruction::LShr)) {
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee..cf5230a34e5b6 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -120,6 +120,79 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
   ret i1 %ov
 }
 
+; Borrow chain: or(icmp ult(X, Y), and(icmp eq(X, Y), carry_in)) -> cmp + sbcs.
+; See https://github.com/llvm/llvm-project/issues/106118.
+
+define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    sbcs xzr, x1, x3
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %b0 = icmp ult i64 %x0, %y0
+  %b1 = icmp ult i64 %x1, %y1
+  %e1 = icmp eq i64 %x1, %y1
+  %bp = and i1 %b0, %e1
+  %br = or i1 %b1, %bp
+  ret i1 %br
+}
+
+define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y2) nounwind {
+; CHECK-LABEL: subcarry_ult_3x64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x3
+; CHECK-NEXT:    sbcs xzr, x1, x4
+; CHECK-NEXT:    sbcs xzr, x2, x5
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %b0 = icmp ult i64 %x0, %y0
+  %b1 = icmp ult i64 %x1, %y1
+  %e1 = icmp eq i64 %x1, %y1
+  %bp1 = and i1 %b0, %e1
+  %br1 = or i1 %b1, %bp1
+  %b2 = icmp ult i64 %x2, %y2
+  %e2 = icmp eq i64 %x2, %y2
+  %bp2 = and i1 %br1, %e2
+  %br2 = or i1 %b2, %bp2
+  ret i1 %br2
+}
+
+; Negative test: icmp eq compares different operands than icmp ult.
+define i1 @no_subcarry_different_ops(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_different_ops:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    ccmp x1, x0, #0, lo
+; CHECK-NEXT:    ccmp x1, x3, #0, ne
+; CHECK-NEXT:    cset w0, lo
+; CHECK-NEXT:    ret
+  %b0 = icmp ult i64 %x0, %y0
+  %b1 = icmp ult i64 %x1, %y1
+  %e1 = icmp eq i64 %x1, %x0
+  %bp = and i1 %b0, %e1
+  %br = or i1 %b1, %bp
+  ret i1 %br
+}
+
+; Negative test: icmp slt instead of icmp ult.
+define i1 @no_subcarry_signed(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_signed:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmp x0, x2
+; CHECK-NEXT:    cset w8, lo
+; CHECK-NEXT:    cmp x1, x3
+; CHECK-NEXT:    csel w8, wzr, w8, ne
+; CHECK-NEXT:    csinc w0, w8, wzr, ge
+; CHECK-NEXT:    ret
+  %b0 = icmp ult i64 %x0, %y0
+  %b1 = icmp slt i64 %x1, %y1
+  %e1 = icmp eq i64 %x1, %y1
+  %bp = and i1 %b0, %e1
+  %br = or i1 %b1, %bp
+  ret i1 %br
+}
+
 ; Verify insertion point for multi-BB.
 
 declare void @call(i1)
@@ -127,12 +200,12 @@ declare void @call(i1)
 define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwind {
 ; CHECK-LABEL: usubo_ult_sub_dominates_i64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    tbz w3, #0, .LBB7_2
+; CHECK-NEXT:    tbz w3, #0, .LBB11_2
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    subs x8, x0, x1
 ; CHECK-NEXT:    cset w3, lo
 ; CHECK-NEXT:    str x8, [x2]
-; CHECK-NEXT:  .LBB7_2: // %common.ret
+; CHECK-NEXT:  .LBB11_2: // %common.ret
 ; CHECK-NEXT:    and w0, w3, #0x1
 ; CHECK-NEXT:    ret
 entry:
@@ -158,7 +231,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
 ; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov w19, w3
 ; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    tbz w3, #0, .LBB8_3
+; CHECK-NEXT:    tbz w3, #0, .LBB12_3
 ; CHECK-NEXT:  // %bb.1: // %t
 ; CHECK-NEXT:    cmp x0, x1
 ; CHECK-NEXT:    mov x22, x0
@@ -168,11 +241,11 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
 ; CHECK-NEXT:    mov w0, w21
 ; CHECK-NEXT:    bl call
 ; CHECK-NEXT:    subs x8, x22, x23
-; CHECK-NEXT:    b.hs .LBB8_3
+; CHECK-NEXT:    b.hs .LBB12_3
 ; CHECK-NEXT:  // %bb.2: // %end
 ; CHECK-NEXT:    mov w19, w21
 ; CHECK-NEXT:    str x8, [x20]
-; CHECK-NEXT:  .LBB8_3: // %common.ret
+; CHECK-NEXT:  .LBB12_3: // %common.ret
 ; CHECK-NEXT:    and w0, w19, #0x1
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 7d5db07c0172a..96bfa3ba93b28 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -671,17 +671,12 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
 ; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
 ; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
 ; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb.
 define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
 ; CHECK-LABEL: subcarry_ult_2x64_2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    cmpq %rdx, %rdi
-; CHECK-NEXT:    setb %dl
-; CHECK-NEXT:    cmpq %rcx, %rsi
-; CHECK-NEXT:    setb %cl
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    andb %dl, %al
-; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    sbbq %rcx, %rsi
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    retq
 entry:
   %0 = icmp ult i64 %x0, %y0


