[llvm] [X86] [AArch64] fuse constant addition after sbb (PR #184541)

Takashi Idobe via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 17:02:24 PST 2026


https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/184541

>From 0da03849f3a3960f6909ce7d29f23afc256b1823 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 3 Mar 2026 21:26:36 -0500
Subject: [PATCH] fuse addition after sbb

---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  44 +++
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  15 +
 llvm/test/CodeGen/AArch64/sbc-add-constant.ll | 273 ++++++++++++++++++
 llvm/test/CodeGen/X86/sbb-add-constant.ll     | 204 +++++++++++++
 4 files changed, 536 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/sbc-add-constant.ll
 create mode 100644 llvm/test/CodeGen/X86/sbb-add-constant.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8497a97aaf5fa..29dffb70f27c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23346,6 +23346,48 @@ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
 }
 
+// Fold ADD(AArch64ISD::SBC(Y, 0, W), C) -> AArch64ISD::SBC(Y, -C, W)
+//
+// SBC(Y, 0, W) = Y - 0 - ~carry = Y + carry - 1
+// Adding C:  Y + carry - 1 + C = Y - (-C) - ~carry = SBC(Y, -C, W)
+//
+// AArch64's SBC has no immediate form, so -C is still materialised via MOV,
+// but that MOV is data-independent of the preceding SUBS and can execute in
+// parallel, reducing the critical-path length from 3 to 2 cycles:
+//   Before: subs -> sbc -> add   (3 dependent cycles)
+//   After:  mov || subs -> sbc   (2 dependent cycles)
+//
+// AArch64 SBC (non-flag-setting) reads NZCV as an input but does not write it,
+// so no flags-output guard is needed. The fold is always safe when SBC has one
+// use and its subtrahend is zero.
+static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue SBC = N->getOperand(0);
+  SDValue C   = N->getOperand(1);
+  // ADD is commutative; constant may be on either side.
+  if (SBC.getOpcode() != AArch64ISD::SBC)
+    std::swap(SBC, C);
+  if (SBC.getOpcode() != AArch64ISD::SBC || !SBC.hasOneUse())
+    return SDValue();
+  if (!isNullConstant(SBC.getOperand(1)))
+    return SDValue();
+  // AArch64 SBC (non-flag-setting) has only one output; no flags guard needed.
+  auto *CC = dyn_cast<ConstantSDNode>(C);
+  if (!CC)
+    return SDValue();
+
+  SDLoc DL(N);
+  return DAG.getNode(AArch64ISD::SBC, DL, VT,
+                     SBC.getOperand(0),
+                     DAG.getConstant(-CC->getAPIntValue(), DL, VT),
+                     SBC.getOperand(2));
+}
+
 static SDValue performAddSubCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
   // Try to change sum of two reductions.
@@ -23371,6 +23413,8 @@ static SDValue performAddSubCombine(SDNode *N,
     return Val;
   if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
     return Val;
+  if (SDValue Val = performAddWithSBCCombine(N, DCI.DAG))
+    return Val;
 
   if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
     return Val;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6384c4d58a480..c1b3ba8cf75a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59549,6 +59549,21 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                        Op0.getOperand(0), Op0.getOperand(2));
   }
 
+  // Fold ADD(SBB(Y,0,W),C) -> SBB(Y,-C,W)
+  // SBB(Y,0,W) = Y - 0 - CF = Y - CF; adding C gives Y - CF + C = Y - (-C) - CF.
+  // The SBB flags output must be dead: changing the subtrahend from 0 to -C
+  // produces different EFLAGS bits.
+  if (Op0.getOpcode() == X86ISD::SBB && Op0->hasOneUse() &&
+      X86::isZeroNode(Op0.getOperand(1)) && !Op0->hasAnyUseOfValue(1)) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Op1)) {
+      SDLoc SBBLoc(Op0);
+      return DAG.getNode(X86ISD::SBB, SBBLoc, Op0->getVTList(),
+                         Op0.getOperand(0),
+                         DAG.getConstant(-C->getAPIntValue(), SBBLoc, VT),
+                         Op0.getOperand(2)).getValue(0);
+    }
+  }
+
   if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
     return IFMA52;
 
diff --git a/llvm/test/CodeGen/AArch64/sbc-add-constant.ll b/llvm/test/CodeGen/AArch64/sbc-add-constant.ll
new file mode 100644
index 0000000000000..2ee3db02f7d87
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sbc-add-constant.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck --check-prefixes=CHECK,CHECK-GI %s
+;
+; Verify that ADD(SBC(Y,0,flags),C) folds to SBC(Y,-C,flags).
+; SBC(Y,0,W) = Y - 0 - ~carry = Y + carry - 1; adding C gives Y + carry - 1 + C
+;            = Y - (-C) - ~carry = SBC(Y,-C,W).
+;
+; AArch64's SBC has no immediate form so -C is materialised via MOV, but the
+; MOV is data-independent of the preceding SUBS and can execute in parallel,
+; reducing the critical-path depth from 3 to 2 cycles.
+;
+; Unlike x86 SBB, AArch64 SBC (non-flag-setting) reads NZCV as an input but
+; does not write it.  There is therefore no "flags output live" guard: folding
+; ADD(SBC(Y,0),C)->SBC(Y,-C) is always safe when the SBC has one use and its
+; subtrahend is zero.
+;
+; Guards that must prevent the fold:
+;   - addend is not a compile-time constant
+;   - SBC value result has more than one use
+
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+
+;------------------------------------------------------------------------------
+; Positive: fold fires (SelectionDAG only; GlobalISel unaffected)
+;------------------------------------------------------------------------------
+
+; Basic i64: sbc xzr + add #10 -> mov #-10 + sbc reg
+define i64 @g_i64(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: g_i64:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT:    subs x9, x0, x1
+; CHECK-SD-NEXT:    sbc x0, x9, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_i64:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    add x0, x8, #10
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  ret i64 %r2
+}
+
+; Basic i32
+define i32 @g_i32(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: g_i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #-10 // =0xfffffff6
+; CHECK-SD-NEXT:    subs w9, w0, w1
+; CHECK-SD-NEXT:    sbc w0, w9, w8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs w8, w0, w1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    sbfx w9, w9, #0, #1
+; CHECK-GI-NEXT:    add w8, w8, w9
+; CHECK-GI-NEXT:    add w0, w8, #10
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+  %val = extractvalue { i32, i1 } %ov, 0
+  %bit = extractvalue { i32, i1 } %ov, 1
+  %ext = sext i1 %bit to i32
+  %r   = add i32 %val, %ext
+  %r2  = add i32 %r, 10
+  ret i32 %r2
+}
+
+; Negative addend: add -5 -> sbc with #5 materialised
+define i64 @g_neg_const(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: g_neg_const:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #5 // =0x5
+; CHECK-SD-NEXT:    subs x9, x0, x1
+; CHECK-SD-NEXT:    sbc x0, x9, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_neg_const:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    sub x0, x8, #5
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, -5
+  ret i64 %r2
+}
+
+; In-register use with multiply: shows the saved critical-path cycle.
+; Before fix: subs -> sbc -> add -> mul  (add on critical path)
+; After  fix: mov || subs -> sbc -> mul  (mov runs in parallel with subs)
+define i64 @g_mul(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: g_mul:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    subs x8, x0, x1
+; CHECK-SD-NEXT:    mov x9, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT:    sbc x8, x8, x9
+; CHECK-SD-NEXT:    mul x0, x8, x2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_mul:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    add x8, x8, #10
+; CHECK-GI-NEXT:    mul x0, x8, x2
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  %r3  = mul i64 %r2, %c
+  ret i64 %r3
+}
+
+; ADC regression: existing add-with-carry fold must be unaffected.
+define i64 @f_adc_regression(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: f_adc_regression:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, #10 // =0xa
+; CHECK-SD-NEXT:    adds x9, x0, x1
+; CHECK-SD-NEXT:    adc x0, x9, x8
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: f_adc_regression:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adds x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, hs
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    add x0, x8, #10
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = zext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  ret i64 %r2
+}
+
+; Borrow-chain: unlike x86, AArch64 SBC has no flags output, so no flags-live
+; guard is needed.  Note that here the lo borrow is reused, so SelectionDAG
+; materialises it with cset/sub and never forms an SBC node for lo; the CHECK
+define {i64, i64} @g_flags_fold(i64 %a_lo, i64 %a_hi, i64 %b_lo, i64 %b_hi) {
+; CHECK-SD-LABEL: g_flags_fold:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    subs x8, x0, x2
+; CHECK-SD-NEXT:    cset w9, lo
+; CHECK-SD-NEXT:    subs x10, x1, x3
+; CHECK-SD-NEXT:    sbc x10, x10, xzr
+; CHECK-SD-NEXT:    sub x8, x8, x9
+; CHECK-SD-NEXT:    sub x1, x10, x9
+; CHECK-SD-NEXT:    add x0, x8, #10
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_flags_fold:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x2
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    subs x10, x1, x3
+; CHECK-GI-NEXT:    cset w11, lo
+; CHECK-GI-NEXT:    sbfx x12, x9, #0, #1
+; CHECK-GI-NEXT:    and x9, x9, #0x1
+; CHECK-GI-NEXT:    sbfx x11, x11, #0, #1
+; CHECK-GI-NEXT:    add x8, x8, x12
+; CHECK-GI-NEXT:    add x10, x10, x11
+; CHECK-GI-NEXT:    add x0, x8, #10
+; CHECK-GI-NEXT:    sub x1, x10, x9
+; CHECK-GI-NEXT:    ret
+  %lo_ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_lo, i64 %b_lo)
+  %lo_val = extractvalue { i64, i1 } %lo_ov, 0
+  %lo_bit = extractvalue { i64, i1 } %lo_ov, 1
+  %lo_ext = sext i1 %lo_bit to i64
+  %lo_sbc = add i64 %lo_val, %lo_ext
+  %lo_r   = add i64 %lo_sbc, 10
+
+  %hi_sub = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_hi, i64 %b_hi)
+  %hi_val = extractvalue { i64, i1 } %hi_sub, 0
+  %hi_bit = extractvalue { i64, i1 } %hi_sub, 1
+  %hi_ext = sext i1 %hi_bit to i64
+  %hi_sbc = add i64 %hi_val, %hi_ext
+  %lo_b   = zext i1 %lo_bit to i64
+  %hi_r   = sub i64 %hi_sbc, %lo_b
+
+  %ret    = insertvalue {i64, i64} undef, i64 %lo_r, 0
+  %ret2   = insertvalue {i64, i64} %ret,  i64 %hi_r, 1
+  ret {i64, i64} %ret2
+}
+
+;------------------------------------------------------------------------------
+; Negative: fold must not fire
+;------------------------------------------------------------------------------
+
+; Non-constant addend.
+define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: g_nonconstant:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    subs x8, x0, x1
+; CHECK-SD-NEXT:    sbc x8, x8, xzr
+; CHECK-SD-NEXT:    add x0, x8, x2
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_nonconstant:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    add x8, x8, x2
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    add x0, x8, x9
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, %c
+  ret i64 %r2
+}
+
+; Multiple uses of SBC result: hasOneUse() guard.
+define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) {
+; CHECK-SD-LABEL: g_multi_use:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    subs x8, x0, x1
+; CHECK-SD-NEXT:    sbc x8, x8, xzr
+; CHECK-SD-NEXT:    add x0, x8, #10
+; CHECK-SD-NEXT:    str x8, [x2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: g_multi_use:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    subs x8, x0, x1
+; CHECK-GI-NEXT:    cset w9, lo
+; CHECK-GI-NEXT:    sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT:    add x8, x8, x9
+; CHECK-GI-NEXT:    add x0, x8, #10
+; CHECK-GI-NEXT:    str x8, [x2]
+; CHECK-GI-NEXT:    ret
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %sbc = add i64 %val, %ext
+  store i64 %sbc, ptr %out
+  %r   = add i64 %sbc, 10
+  ret i64 %r
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sbb-add-constant.ll b/llvm/test/CodeGen/X86/sbb-add-constant.ll
new file mode 100644
index 0000000000000..4897589046b67
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sbb-add-constant.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+;
+; Verify that ADD(SBB(Y,0,flags),C) folds to SBB(Y,-C,flags).
+; SBB(Y,0) = Y - CF; adding C gives Y - CF + C = Y - (-C) - CF = SBB(Y,-C).
+; The symmetric ADC fold (ADD(ADC(Y,0),X) -> ADC(X,Y)) already exists;
+; this tests the missing SBB counterpart.
+;
+; Guards that must prevent the fold:
+;   - addend is not a compile-time constant
+;   - SBB value result has more than one use
+;   - SBB flags output is consumed by a subsequent carry instruction
+
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+
+;------------------------------------------------------------------------------
+; Positive: fold fires
+;------------------------------------------------------------------------------
+
+; Basic i64: sbb $0 + add $10 -> sbb $-10
+define i64 @g_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: g_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    sbbq $-10, %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  ret i64 %r2
+}
+
+; Basic i32: sbbl $-10
+define i32 @g_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: g_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    subl %esi, %eax
+; CHECK-NEXT:    sbbl $-10, %eax
+; CHECK-NEXT:    retq
+  %ov  = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+  %val = extractvalue { i32, i1 } %ov, 0
+  %bit = extractvalue { i32, i1 } %ov, 1
+  %ext = sext i1 %bit to i32
+  %r   = add i32 %val, %ext
+  %r2  = add i32 %r, 10
+  ret i32 %r2
+}
+
+; Negative addend: add -5 -> sbb $5  (negate of -5 is 5)
+define i64 @g_neg_const(i64 %a, i64 %b) {
+; CHECK-LABEL: g_neg_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    sbbq $5, %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, -5
+  ret i64 %r2
+}
+
+; Large constant fitting in imm32: add $4096 -> sbb $-4096
+define i64 @g_large_const(i64 %a, i64 %b) {
+; CHECK-LABEL: g_large_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    sbbq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 4096
+  ret i64 %r2
+}
+
+; In-register use with multiply: shows addq truly eliminated (no leaq escape).
+; Before fix: sub + sbb $0 + addq $10 + imulq = 4 insns
+; After fix:  sub + sbb $-10 + imulq = 3 insns
+define i64 @g_mul(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: g_mul:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    subq %rsi, %rax
+; CHECK-NEXT:    sbbq $-10, %rax
+; CHECK-NEXT:    imulq %rdx, %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  %r3  = mul i64 %r2, %c
+  ret i64 %r3
+}
+
+; ADC regression: existing ADD(ADC(Y,0),X)->ADC(X,Y) fold must be unaffected.
+define i64 @f_adc_regression(i64 %a, i64 %b) {
+; CHECK-LABEL: f_adc_regression:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    adcq $10, %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = zext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, 10
+  ret i64 %r2
+}
+
+;------------------------------------------------------------------------------
+; Negative: fold must not fire
+;------------------------------------------------------------------------------
+
+; Non-constant addend: sbb $0 must remain, addq %reg must not be folded.
+define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: g_nonconstant:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    sbbq $0, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rdx), %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %r   = add i64 %val, %ext
+  %r2  = add i64 %r, %c
+  ret i64 %r2
+}
+
+; Multiple uses of SBB result: hasOneUse() guard prevents fold.
+; The SBB value feeds both the add-10 and a store; fold must not fire.
+define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) {
+; CHECK-LABEL: g_multi_use:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq %rsi, %rdi
+; CHECK-NEXT:    sbbq $0, %rdi
+; CHECK-NEXT:    movq %rdi, (%rdx)
+; CHECK-NEXT:    leaq 10(%rdi), %rax
+; CHECK-NEXT:    retq
+  %ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+  %val = extractvalue { i64, i1 } %ov, 0
+  %bit = extractvalue { i64, i1 } %ov, 1
+  %ext = sext i1 %bit to i64
+  %sbb = add i64 %val, %ext
+  store i64 %sbb, ptr %out
+  %r   = add i64 %sbb, 10
+  ret i64 %r
+}
+
+; Flags live: SBB carry output consumed by a second SBB in a borrow chain.
+; !hasAnyUseOfValue(1) guard prevents fold; constant must not be absorbed
+; into the first sbb because doing so would corrupt the carry into the second.
+define {i64, i64} @g_flags_live(i64 %a_lo, i64 %a_hi, i64 %b_lo, i64 %b_hi) {
+; CHECK-LABEL: g_flags_live:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %r8d, %r8d
+; CHECK-NEXT:    subq %rdx, %rdi
+; CHECK-NEXT:    setb %r8b
+; CHECK-NEXT:    subq %r8, %rdi
+; CHECK-NEXT:    leaq 10(%rdi), %rax
+; CHECK-NEXT:    subq %rcx, %rsi
+; CHECK-NEXT:    sbbq $0, %rsi
+; CHECK-NEXT:    subq %r8, %rsi
+; CHECK-NEXT:    movq %rsi, %rdx
+; CHECK-NEXT:    retq
+  %lo_ov  = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_lo, i64 %b_lo)
+  %lo_val = extractvalue { i64, i1 } %lo_ov, 0
+  %lo_bit = extractvalue { i64, i1 } %lo_ov, 1
+  %lo_ext = sext i1 %lo_bit to i64
+  %lo_sbb = add i64 %lo_val, %lo_ext       ; SBB(lo, 0, sub_flags)
+  %lo_r   = add i64 %lo_sbb, 10            ; fold must NOT fire: carry is live
+
+  ; High limb consumes the same borrow — keeps lo_sbb carry output live.
+  %hi_sub = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_hi, i64 %b_hi)
+  %hi_val = extractvalue { i64, i1 } %hi_sub, 0
+  %hi_bit = extractvalue { i64, i1 } %hi_sub, 1
+  %hi_ext = sext i1 %hi_bit to i64
+  %hi_sbb = add i64 %hi_val, %hi_ext
+  %lo_b   = zext i1 %lo_bit to i64
+  %hi_r   = sub i64 %hi_sbb, %lo_b
+
+  %ret    = insertvalue {i64, i64} undef,  i64 %lo_r, 0
+  %ret2   = insertvalue {i64, i64} %ret,   i64 %hi_r, 1
+  ret {i64, i64} %ret2
+}



More information about the llvm-commits mailing list