[llvm] [CGP] Reconstruct borrow chain from icmp pattern for subtract-with-carry (PR #189018)
Paweł Bylica via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 27 07:49:33 PDT 2026
https://github.com/chfast created https://github.com/llvm/llvm-project/pull/189018
InstCombine simplifies chained usub.with.overflow intrinsics (from multi-precision subtraction, e.g. __builtin_subcll) into an icmp pattern:
carry_out = or(icmp ult X, Y, and(icmp eq X, Y, carry_in))
This is algebraically correct but prevents the backend from generating efficient subtract-with-borrow instructions (x86 sbb, AArch64 sbcs).
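For one step of a two-limb subtraction, the canonicalized IR looks as follows (this is the subcarry_ult_2x64 test added below; the comments are explanatory annotations, not part of the test):

  define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) {
    %b0 = icmp ult i64 %x0, %y0  ; borrow out of the low limb
    %b1 = icmp ult i64 %x1, %y1  ; high limb strictly smaller
    %e1 = icmp eq i64 %x1, %y1   ; high limbs equal: propagate the low borrow
    %bp = and i1 %b0, %e1
    %br = or i1 %b1, %bp         ; carry_out
    ret i1 %br
  }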
Add combineToUSubWithCarry() to CodeGenPrepare to match this pattern and reconstruct the chained usub.with.overflow form. The existing DAG combiner (combineCarryDiamond) then combines these into USUBO_CARRY.
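As a sketch, the rebuilt form follows the "Build the replacement" comment in the patch; the function wrapper, declare line, and value names here are illustrative additions, not literal pass output:

  declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64)

  define i1 @rebuilt_borrow(i64 %a, i64 %b, i1 %carry_in) {
    %carry.ext = zext i1 %carry_in to i64
    %s1 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
    %diff = extractvalue { i64, i1 } %s1, 0
    %borrow1 = extractvalue { i64, i1 } %s1, 1
    ; A second usubo subtracts the zero-extended carry from the difference.
    %s2 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %diff, i64 %carry.ext)
    %borrow2 = extractvalue { i64, i1 } %s2, 1
    ; A borrow propagates out if either subtraction overflowed.
    %carry.out = or i1 %borrow1, %borrow2
    ret i1 %carry.out
  }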
The transformation is gated on target support for ISD::USUBO_CARRY.
Fixes #106118.
From ad750a31cbbda7a29438dab7408eecede34b73bc Mon Sep 17 00:00:00 2001
From: Paweł Bylica <pawel at hepcolgum.band>
Date: Fri, 27 Mar 2026 14:11:49 +0100
Subject: [PATCH] [CGP] Reconstruct borrow chain from icmp pattern for
subtract-with-carry
InstCombine simplifies chained usub.with.overflow intrinsics (from
multi-precision subtraction, e.g. __builtin_subcll) into an icmp pattern:
carry_out = or(icmp ult X, Y, and(icmp eq X, Y, carry_in))
This is algebraically correct but prevents the backend from generating
efficient subtract-with-borrow instructions (x86 sbb, aarch64 sbcs).
Add combineToUSubWithCarry() to CodeGenPrepare that matches this pattern
and reconstructs the chained usub.with.overflow form. The existing DAG
combiner (combineCarryDiamond) then combines these into USUBO_CARRY.
The transformation is gated on target support for ISD::USUBO_CARRY.
Fixes #106118.
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 67 +++++++++++++++++++++
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 83 ++++++++++++++++++++++++--
llvm/test/CodeGen/X86/subcarry.ll | 9 +--
3 files changed, 147 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 56ee6e8b43304..64408ed3b9acb 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -474,6 +474,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUSubWithCarry(BinaryOperator *OrI);
bool unfoldPowerOf2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
@@ -1762,6 +1763,68 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+/// FIXME: The name is similar to combineToUSubWithOverflow. Consider renaming
+/// one or both to better distinguish them.
+///
+/// Reconstruct a subtract-with-borrow chain from its canonicalized icmp form.
+///
+/// InstCombine simplifies chained usub.with.overflow intrinsics (used for
+/// multi-precision subtraction) into icmp patterns:
+/// carry_out = or(icmp ult A, B, and(icmp eq A, B, carry_in))
+///
+/// This is algebraically correct but prevents the backend from generating
+/// efficient subtract-with-borrow instructions (e.g. x86 sbb, aarch64 sbcs).
+///
+/// This function matches the pattern and reconstructs the chained
+/// usub.with.overflow form that the DAG combiner can lower to USUBO_CARRY.
+bool CodeGenPrepare::combineToUSubWithCarry(BinaryOperator *OrI) {
+ if (!OrI->getType()->isIntOrIntVectorTy(1))
+ return false;
+
+ assert(OrI->getOpcode() == Instruction::Or && "Expected or instruction");
+
+ // Match: or(icmp ult A, B, and(icmp eq A, B, carry_in))
+ // with all commuted variants of and and icmp eq operand order.
+ Value *A, *B, *CarryIn;
+ if (!match(
+ OrI,
+ m_c_BinOp(
+ m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(A), m_Value(B)),
+ m_c_And(m_CombineOr(m_SpecificICmp(ICmpInst::ICMP_EQ,
+ m_Deferred(A), m_Deferred(B)),
+ m_SpecificICmp(ICmpInst::ICMP_EQ,
+ m_Deferred(B), m_Deferred(A))),
+ m_Value(CarryIn)))))
+ return false;
+
+ // Check that the target supports subtract-with-borrow for this type.
+ Type *OpTy = A->getType();
+ EVT VT = TLI->getValueType(*DL, OpTy);
+ if (!TLI->isOperationLegalOrCustom(ISD::USUBO_CARRY, VT))
+ return false;
+
+ // Build the replacement:
+ // %carry_ext = zext i1 %carry_in to iN
+ // (%diff, %borrow1) = usubo(A, B)
+ // (_, %borrow2) = usubo(%diff, %carry_ext)
+ // %result = or %borrow1, %borrow2
+ IRBuilder<> Builder(OrI);
+
+ Value *CarryExt = Builder.CreateZExt(CarryIn, OpTy, "carry.ext");
+ Value *Sub1 =
+ Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, A, B);
+ Value *Diff = Builder.CreateExtractValue(Sub1, 0, "diff");
+ Value *Borrow1 = Builder.CreateExtractValue(Sub1, 1, "borrow");
+ Value *Sub2 = Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow,
+ Diff, CarryExt);
+ Value *Borrow2 = Builder.CreateExtractValue(Sub2, 1, "borrow");
+ Value *Result = Builder.CreateOr(Borrow1, Borrow2, "carry.out");
+
+ replaceAllUsesWith(OrI, Result, FreshBBs, IsHugeFunc);
+ OrI->eraseFromParent();
+ return true;
+}
+
// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
// The same transformation exists in DAG combiner, but we repeat it here because
// DAG builder can break the pattern by moving icmp into a successor block.
@@ -8979,6 +9042,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
return true;
+ if (BinOp && BinOp->getOpcode() == Instruction::Or &&
+ combineToUSubWithCarry(BinOp))
+ return true;
+
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee..cf5230a34e5b6 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -120,6 +120,79 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
ret i1 %ov
}
+; Borrow chain: or(icmp ult X, Y, and(icmp eq X, Y, carry_in)) -> cmp + sbcs.
+; See https://github.com/llvm/llvm-project/issues/106118.
+
+define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: sbcs xzr, x1, x3
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y2) nounwind {
+; CHECK-LABEL: subcarry_ult_3x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x3
+; CHECK-NEXT: sbcs xzr, x1, x4
+; CHECK-NEXT: sbcs xzr, x2, x5
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp1 = and i1 %b0, %e1
+ %br1 = or i1 %b1, %bp1
+ %b2 = icmp ult i64 %x2, %y2
+ %e2 = icmp eq i64 %x2, %y2
+ %bp2 = and i1 %br1, %e2
+ %br2 = or i1 %b2, %bp2
+ ret i1 %br2
+}
+
+; Negative test: icmp eq compares different operands than icmp ult.
+define i1 @no_subcarry_different_ops(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_different_ops:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x0, #0, lo
+; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %x0
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Negative test: icmp slt instead of icmp ult.
+define i1 @no_subcarry_signed(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_signed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, ge
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp slt i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
; Verify insertion point for multi-BB.
declare void @call(i1)
@@ -127,12 +200,12 @@ declare void @call(i1)
define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwind {
; CHECK-LABEL: usubo_ult_sub_dominates_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: tbz w3, #0, .LBB7_2
+; CHECK-NEXT: tbz w3, #0, .LBB11_2
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: subs x8, x0, x1
; CHECK-NEXT: cset w3, lo
; CHECK-NEXT: str x8, [x2]
-; CHECK-NEXT: .LBB7_2: // %common.ret
+; CHECK-NEXT: .LBB11_2: // %common.ret
; CHECK-NEXT: and w0, w3, #0x1
; CHECK-NEXT: ret
entry:
@@ -158,7 +231,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w3
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: tbz w3, #0, .LBB8_3
+; CHECK-NEXT: tbz w3, #0, .LBB12_3
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: cmp x0, x1
; CHECK-NEXT: mov x22, x0
@@ -168,11 +241,11 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: mov w0, w21
; CHECK-NEXT: bl call
; CHECK-NEXT: subs x8, x22, x23
-; CHECK-NEXT: b.hs .LBB8_3
+; CHECK-NEXT: b.hs .LBB12_3
; CHECK-NEXT: // %bb.2: // %end
; CHECK-NEXT: mov w19, w21
; CHECK-NEXT: str x8, [x20]
-; CHECK-NEXT: .LBB8_3: // %common.ret
+; CHECK-NEXT: .LBB12_3: // %common.ret
; CHECK-NEXT: and w0, w19, #0x1
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 7d5db07c0172a..96bfa3ba93b28 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -671,17 +671,12 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb.
define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; CHECK-LABEL: subcarry_ult_2x64_2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpq %rdx, %rdi
-; CHECK-NEXT: setb %dl
-; CHECK-NEXT: cmpq %rcx, %rsi
-; CHECK-NEXT: setb %cl
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: andb %dl, %al
-; CHECK-NEXT: orb %cl, %al
+; CHECK-NEXT: sbbq %rcx, %rsi
+; CHECK-NEXT: setb %al
; CHECK-NEXT: retq
entry:
%0 = icmp ult i64 %x0, %y0