[llvm] [X86] [AArch64] fuse constant addition after sbb (PR #184541)
Takashi Idobe via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 17:02:24 PST 2026
https://github.com/Takashiidobe updated https://github.com/llvm/llvm-project/pull/184541
>From 0da03849f3a3960f6909ce7d29f23afc256b1823 Mon Sep 17 00:00:00 2001
From: Takashiidobe <idobetakashi at gmail.com>
Date: Tue, 3 Mar 2026 21:26:36 -0500
Subject: [PATCH] fuse addition after sbb
---
.../Target/AArch64/AArch64ISelLowering.cpp | 44 +++
llvm/lib/Target/X86/X86ISelLowering.cpp | 15 +
llvm/test/CodeGen/AArch64/sbc-add-constant.ll | 273 ++++++++++++++++++
llvm/test/CodeGen/X86/sbb-add-constant.ll | 204 +++++++++++++
4 files changed, 536 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sbc-add-constant.ll
create mode 100644 llvm/test/CodeGen/X86/sbb-add-constant.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8497a97aaf5fa..29dffb70f27c4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23346,6 +23346,48 @@ static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
}
+// Fold ADD(AArch64ISD::SBC(Y, 0, W), C) -> AArch64ISD::SBC(Y, -C, W)
+//
+// SBC computes Y - X - (1 - carry), so with a zero subtrahend:
+//   SBC(Y, 0, W) = Y - 0 - (1 - carry) = Y + carry - 1
+// Adding the constant C:
+//   Y + carry - 1 + C = Y - (-C) - (1 - carry) = SBC(Y, -C, W)
+// (APInt negation is modular, so -C is well-defined for every C).
+//
+// AArch64's SBC has no immediate form, so -C is still materialized via MOV,
+// but that MOV is data-independent of the preceding SUBS and can execute in
+// parallel, reducing the critical-path length from 3 to 2 cycles:
+//   Before: subs -> sbc -> add   (3 dependent cycles)
+//   After:  mov || subs -> sbc   (2 dependent cycles)
+//
+// AArch64 SBC (non-flag-setting) reads NZCV as an input but does not write it,
+// so no flags-output guard is needed. The fold is always safe when SBC has one
+// use and its subtrahend is zero.
+static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
+  // Only scalar i32/i64 ADD nodes are candidates for this fold.
+  if (N->getOpcode() != ISD::ADD)
+    return SDValue();
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  SDValue SBC = N->getOperand(0);
+  SDValue C = N->getOperand(1);
+  // ADD is commutative; constant may be on either side.
+  if (SBC.getOpcode() != AArch64ISD::SBC)
+    std::swap(SBC, C);
+  // Single use required so rewriting the SBC does not duplicate work for
+  // other consumers of the original node.
+  if (SBC.getOpcode() != AArch64ISD::SBC || !SBC.hasOneUse())
+    return SDValue();
+  // The algebra above only holds when the original subtrahend is zero.
+  if (!isNullConstant(SBC.getOperand(1)))
+    return SDValue();
+  // AArch64 SBC (non-flag-setting) has only one output; no flags guard needed.
+  auto *CC = dyn_cast<ConstantSDNode>(C);
+  if (!CC)
+    return SDValue();
+
+  SDLoc DL(N);
+  // Rebuild the SBC with -C as the subtrahend, reusing the minuend
+  // (operand 0) and the incoming carry/flags input (operand 2) unchanged.
+  return DAG.getNode(AArch64ISD::SBC, DL, VT,
+                     SBC.getOperand(0),
+                     DAG.getConstant(-CC->getAPIntValue(), DL, VT),
+                     SBC.getOperand(2));
+}
+
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
// Try to change sum of two reductions.
@@ -23371,6 +23413,8 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
return Val;
+ if (SDValue Val = performAddWithSBCCombine(N, DCI.DAG))
+ return Val;
if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
return Val;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6384c4d58a480..c1b3ba8cf75a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59549,6 +59549,21 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
+ // Fold ADD(SBB(Y,0,W),C) -> SBB(Y,-C,W)
+ // SBB(Y,0,W) = Y - 0 - CF = Y - CF; adding C gives Y - CF + C = Y - (-C) - CF.
+ // The SBB flags output must be dead: changing the subtrahend from 0 to -C
+ // produces different EFLAGS bits.
+ if (Op0.getOpcode() == X86ISD::SBB && Op0->hasOneUse() &&
+ X86::isZeroNode(Op0.getOperand(1)) && !Op0->hasAnyUseOfValue(1)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Op1)) {
+ SDLoc SBBLoc(Op0);
+ return DAG.getNode(X86ISD::SBB, SBBLoc, Op0->getVTList(),
+ Op0.getOperand(0),
+ DAG.getConstant(-C->getAPIntValue(), SBBLoc, VT),
+ Op0.getOperand(2)).getValue(0);
+ }
+ }
+
if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
return IFMA52;
diff --git a/llvm/test/CodeGen/AArch64/sbc-add-constant.ll b/llvm/test/CodeGen/AArch64/sbc-add-constant.ll
new file mode 100644
index 0000000000000..2ee3db02f7d87
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sbc-add-constant.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -global-isel < %s | FileCheck --check-prefixes=CHECK,CHECK-GI %s
+;
+; Verify that ADD(SBC(Y,0,flags),C) folds to SBC(Y,-C,flags).
+; SBC(Y,0,W) = Y - 0 - ~carry = Y + carry - 1; adding C gives Y + carry - 1 + C
+; = Y - (-C) - ~carry = SBC(Y,-C,W).
+;
+; AArch64's SBC has no immediate form so -C is materialised via MOV, but the
+; MOV is data-independent of the preceding SUBS and can execute in parallel,
+; reducing the critical-path depth from 3 to 2 cycles.
+;
+; Unlike x86 SBB, AArch64 SBC (non-flag-setting) reads NZCV as an input but
+; does not write it. There is therefore no "flags output live" guard: folding
+; ADD(SBC(Y,0),C)->SBC(Y,-C) is always safe when the SBC has one use and its
+; subtrahend is zero.
+;
+; Guards that must prevent the fold:
+; - addend is not a compile-time constant
+; - SBC value result has more than one use
+
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+
+;------------------------------------------------------------------------------
+; Positive: fold fires (SelectionDAG only; GlobalISel unaffected)
+;------------------------------------------------------------------------------
+
+; Basic i64: sbc xzr + add #10 -> mov #-10 + sbc reg
+define i64 @g_i64(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: g_i64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov x8, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT: subs x9, x0, x1
+; CHECK-SD-NEXT: sbc x0, x9, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_i64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT: add x8, x8, x9
+; CHECK-GI-NEXT: add x0, x8, #10
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ ret i64 %r2
+}
+
+; Basic i32
+define i32 @g_i32(i32 %a, i32 %b) {
+; CHECK-SD-LABEL: g_i32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #-10 // =0xfffffff6
+; CHECK-SD-NEXT: subs w9, w0, w1
+; CHECK-SD-NEXT: sbc w0, w9, w8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_i32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs w8, w0, w1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: sbfx w9, w9, #0, #1
+; CHECK-GI-NEXT: add w8, w8, w9
+; CHECK-GI-NEXT: add w0, w8, #10
+; CHECK-GI-NEXT: ret
+ %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %ov, 0
+ %bit = extractvalue { i32, i1 } %ov, 1
+ %ext = sext i1 %bit to i32
+ %r = add i32 %val, %ext
+ %r2 = add i32 %r, 10
+ ret i32 %r2
+}
+
+; Negative addend: add -5 -> sbc with #5 materialised
+define i64 @g_neg_const(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: g_neg_const:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #5 // =0x5
+; CHECK-SD-NEXT: subs x9, x0, x1
+; CHECK-SD-NEXT: sbc x0, x9, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_neg_const:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT: add x8, x8, x9
+; CHECK-GI-NEXT: sub x0, x8, #5
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, -5
+ ret i64 %r2
+}
+
+; In-register use with multiply: shows the saved critical-path cycle.
+; Before fix: subs -> sbc -> add -> mul (add on critical path)
+; After fix: mov || subs -> sbc -> mul (mov runs in parallel with subs)
+define i64 @g_mul(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: g_mul:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x1
+; CHECK-SD-NEXT: mov x9, #-10 // =0xfffffffffffffff6
+; CHECK-SD-NEXT: sbc x8, x8, x9
+; CHECK-SD-NEXT: mul x0, x8, x2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_mul:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT: add x8, x8, x9
+; CHECK-GI-NEXT: add x8, x8, #10
+; CHECK-GI-NEXT: mul x0, x8, x2
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ %r3 = mul i64 %r2, %c
+ ret i64 %r3
+}
+
+; ADC regression: existing add-with-carry fold must be unaffected.
+define i64 @f_adc_regression(i64 %a, i64 %b) {
+; CHECK-SD-LABEL: f_adc_regression:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: mov w8, #10 // =0xa
+; CHECK-SD-NEXT: adds x9, x0, x1
+; CHECK-SD-NEXT: adc x0, x9, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: f_adc_regression:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: adds x8, x0, x1
+; CHECK-GI-NEXT: cset w9, hs
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: add x8, x8, x9
+; CHECK-GI-NEXT: add x0, x8, #10
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = zext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ ret i64 %r2
+}
+
+; Borrow-chain: the lo borrow is consumed twice (by the sext add and by the
+; hi limb), so ISel materialises it with cset instead of forming an
+; AArch64ISD::SBC node for the lo limb. There is therefore no SBC for the
+; combine to match, and the `add x0, x8, #10` remains a separate instruction
+; in the CHECK-SD output below.
+define {i64, i64} @g_flags_fold(i64 %a_lo, i64 %a_hi, i64 %b_lo, i64 %b_hi) {
+; CHECK-SD-LABEL: g_flags_fold:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x2
+; CHECK-SD-NEXT: cset w9, lo
+; CHECK-SD-NEXT: subs x10, x1, x3
+; CHECK-SD-NEXT: sbc x10, x10, xzr
+; CHECK-SD-NEXT: sub x8, x8, x9
+; CHECK-SD-NEXT: sub x1, x10, x9
+; CHECK-SD-NEXT: add x0, x8, #10
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_flags_fold:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x2
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: subs x10, x1, x3
+; CHECK-GI-NEXT: cset w11, lo
+; CHECK-GI-NEXT: sbfx x12, x9, #0, #1
+; CHECK-GI-NEXT: and x9, x9, #0x1
+; CHECK-GI-NEXT: sbfx x11, x11, #0, #1
+; CHECK-GI-NEXT: add x8, x8, x12
+; CHECK-GI-NEXT: add x10, x10, x11
+; CHECK-GI-NEXT: add x0, x8, #10
+; CHECK-GI-NEXT: sub x1, x10, x9
+; CHECK-GI-NEXT: ret
+ %lo_ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_lo, i64 %b_lo)
+ %lo_val = extractvalue { i64, i1 } %lo_ov, 0
+ %lo_bit = extractvalue { i64, i1 } %lo_ov, 1
+ %lo_ext = sext i1 %lo_bit to i64
+ %lo_sbc = add i64 %lo_val, %lo_ext
+ %lo_r = add i64 %lo_sbc, 10
+
+ %hi_sub = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_hi, i64 %b_hi)
+ %hi_val = extractvalue { i64, i1 } %hi_sub, 0
+ %hi_bit = extractvalue { i64, i1 } %hi_sub, 1
+ %hi_ext = sext i1 %hi_bit to i64
+ %hi_sbc = add i64 %hi_val, %hi_ext
+ %lo_b = zext i1 %lo_bit to i64
+ %hi_r = sub i64 %hi_sbc, %lo_b
+
+ %ret = insertvalue {i64, i64} undef, i64 %lo_r, 0
+ %ret2 = insertvalue {i64, i64} %ret, i64 %hi_r, 1
+ ret {i64, i64} %ret2
+}
+
+;------------------------------------------------------------------------------
+; Negative: fold must not fire
+;------------------------------------------------------------------------------
+
+; Non-constant addend.
+define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) {
+; CHECK-SD-LABEL: g_nonconstant:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x1
+; CHECK-SD-NEXT: sbc x8, x8, xzr
+; CHECK-SD-NEXT: add x0, x8, x2
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_nonconstant:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: add x8, x8, x2
+; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT: add x0, x8, x9
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, %c
+ ret i64 %r2
+}
+
+; Multiple uses of SBC result: hasOneUse() guard.
+define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) {
+; CHECK-SD-LABEL: g_multi_use:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: subs x8, x0, x1
+; CHECK-SD-NEXT: sbc x8, x8, xzr
+; CHECK-SD-NEXT: add x0, x8, #10
+; CHECK-SD-NEXT: str x8, [x2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: g_multi_use:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: subs x8, x0, x1
+; CHECK-GI-NEXT: cset w9, lo
+; CHECK-GI-NEXT: sbfx x9, x9, #0, #1
+; CHECK-GI-NEXT: add x8, x8, x9
+; CHECK-GI-NEXT: add x0, x8, #10
+; CHECK-GI-NEXT: str x8, [x2]
+; CHECK-GI-NEXT: ret
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %sbc = add i64 %val, %ext
+ store i64 %sbc, ptr %out
+ %r = add i64 %sbc, 10
+ ret i64 %r
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/sbb-add-constant.ll b/llvm/test/CodeGen/X86/sbb-add-constant.ll
new file mode 100644
index 0000000000000..4897589046b67
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sbb-add-constant.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+;
+; Verify that ADD(SBB(Y,0,flags),C) folds to SBB(Y,-C,flags).
+; SBB(Y,0) = Y - CF; adding C gives Y - CF + C = Y - (-C) - CF = SBB(Y,-C).
+; The symmetric ADC fold (ADD(ADC(Y,0),X) -> ADC(X,Y)) already exists;
+; this tests the missing SBB counterpart.
+;
+; Guards that must prevent the fold:
+; - addend is not a compile-time constant
+; - SBB value result has more than one use
+; - SBB flags output is consumed by a subsequent carry instruction
+
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64)
+declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32)
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64)
+
+;------------------------------------------------------------------------------
+; Positive: fold fires
+;------------------------------------------------------------------------------
+
+; Basic i64: sbb $0 + add $10 -> sbb $-10
+define i64 @g_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: g_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: sbbq $-10, %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ ret i64 %r2
+}
+
+; Basic i32: sbbl $-10
+define i32 @g_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: g_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: sbbl $-10, %eax
+; CHECK-NEXT: retq
+ %ov = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
+ %val = extractvalue { i32, i1 } %ov, 0
+ %bit = extractvalue { i32, i1 } %ov, 1
+ %ext = sext i1 %bit to i32
+ %r = add i32 %val, %ext
+ %r2 = add i32 %r, 10
+ ret i32 %r2
+}
+
+; Negative addend: add -5 -> sbb $5 (negate of -5 is 5)
+define i64 @g_neg_const(i64 %a, i64 %b) {
+; CHECK-LABEL: g_neg_const:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: sbbq $5, %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, -5
+ ret i64 %r2
+}
+
+; Large constant fitting in imm32: add $4096 -> sbb $-4096
+define i64 @g_large_const(i64 %a, i64 %b) {
+; CHECK-LABEL: g_large_const:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: sbbq $-4096, %rax # imm = 0xF000
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 4096
+ ret i64 %r2
+}
+
+; In-register use with multiply: shows addq truly eliminated (no leaq escape).
+; Before fix: sub + sbb $0 + addq $10 + imulq = 4 insns
+; After fix: sub + sbb $-10 + imulq = 3 insns
+define i64 @g_mul(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: g_mul:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: sbbq $-10, %rax
+; CHECK-NEXT: imulq %rdx, %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ %r3 = mul i64 %r2, %c
+ ret i64 %r3
+}
+
+; ADC regression: existing ADD(ADC(Y,0),X)->ADC(X,Y) fold must be unaffected.
+define i64 @f_adc_regression(i64 %a, i64 %b) {
+; CHECK-LABEL: f_adc_regression:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: adcq $10, %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = zext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, 10
+ ret i64 %r2
+}
+
+;------------------------------------------------------------------------------
+; Negative: fold must not fire
+;------------------------------------------------------------------------------
+
+; Non-constant addend: sbb $0 must remain, addq %reg must not be folded.
+define i64 @g_nonconstant(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: g_nonconstant:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq %rsi, %rdi
+; CHECK-NEXT: sbbq $0, %rdi
+; CHECK-NEXT: leaq (%rdi,%rdx), %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %r = add i64 %val, %ext
+ %r2 = add i64 %r, %c
+ ret i64 %r2
+}
+
+; Multiple uses of SBB result: hasOneUse() guard prevents fold.
+; The SBB value feeds both the add-10 and a store; fold must not fire.
+define i64 @g_multi_use(i64 %a, i64 %b, ptr %out) {
+; CHECK-LABEL: g_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: subq %rsi, %rdi
+; CHECK-NEXT: sbbq $0, %rdi
+; CHECK-NEXT: movq %rdi, (%rdx)
+; CHECK-NEXT: leaq 10(%rdi), %rax
+; CHECK-NEXT: retq
+ %ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
+ %val = extractvalue { i64, i1 } %ov, 0
+ %bit = extractvalue { i64, i1 } %ov, 1
+ %ext = sext i1 %bit to i64
+ %sbb = add i64 %val, %ext
+ store i64 %sbb, ptr %out
+ %r = add i64 %sbb, 10
+ ret i64 %r
+}
+
+; Flags live: the lo borrow is consumed both by the sext add and by the hi
+; limb, so ISel lowers the lo limb with setb + subq rather than an SBB whose
+; EFLAGS output stays live; no SBB(Y,0) node reaches the combine and the
+; constant is not absorbed.
+; NOTE(review): as generated, this does not directly exercise the
+; !hasAnyUseOfValue(1) guard — a test that keeps an actual SBB flags result
+; live would pin that path explicitly.
+define {i64, i64} @g_flags_live(i64 %a_lo, i64 %a_hi, i64 %b_lo, i64 %b_hi) {
+; CHECK-LABEL: g_flags_live:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %r8d, %r8d
+; CHECK-NEXT: subq %rdx, %rdi
+; CHECK-NEXT: setb %r8b
+; CHECK-NEXT: subq %r8, %rdi
+; CHECK-NEXT: leaq 10(%rdi), %rax
+; CHECK-NEXT: subq %rcx, %rsi
+; CHECK-NEXT: sbbq $0, %rsi
+; CHECK-NEXT: subq %r8, %rsi
+; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: retq
+ %lo_ov = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_lo, i64 %b_lo)
+ %lo_val = extractvalue { i64, i1 } %lo_ov, 0
+ %lo_bit = extractvalue { i64, i1 } %lo_ov, 1
+ %lo_ext = sext i1 %lo_bit to i64
+ %lo_sbb = add i64 %lo_val, %lo_ext ; SBB(lo, 0, sub_flags)
+ %lo_r = add i64 %lo_sbb, 10 ; fold must NOT fire: carry is live
+
+ ; High limb consumes the same borrow — keeps lo_sbb carry output live.
+ %hi_sub = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a_hi, i64 %b_hi)
+ %hi_val = extractvalue { i64, i1 } %hi_sub, 0
+ %hi_bit = extractvalue { i64, i1 } %hi_sub, 1
+ %hi_ext = sext i1 %hi_bit to i64
+ %hi_sbb = add i64 %hi_val, %hi_ext
+ %lo_b = zext i1 %lo_bit to i64
+ %hi_r = sub i64 %hi_sbb, %lo_b
+
+ %ret = insertvalue {i64, i64} undef, i64 %lo_r, 0
+ %ret2 = insertvalue {i64, i64} %ret, i64 %hi_r, 1
+ ret {i64, i64} %ret2
+}
More information about the llvm-commits
mailing list