[llvm] [CGP] Reconstruct borrow chain from icmp pattern for subtract-with-carry (PR #189018)
Paweł Bylica via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 4 13:00:20 PDT 2026
https://github.com/chfast updated https://github.com/llvm/llvm-project/pull/189018
From b9f49be912686ea0b1483f1f7dc1f94531d7c88a Mon Sep 17 00:00:00 2001
From: Paweł Bylica <pawel at hepcolgum.band>
Date: Thu, 2 Apr 2026 09:23:48 +0200
Subject: [PATCH 1/2] [X86][AArch64][test] Add borrow chain tests for
subtract-with-carry (NFC)
Add test cases to cgp-usubo.ll (AArch64) and update subcarry.ll (X86)
for the borrow chain pattern:
or(icmp ult A, B, and(icmp eq A, B, carry_in))
Positive tests: 2-limb, 3-limb, commuted eq operands.
Negative tests: i128 (non-simple type), different operands, signed cmp.
The X86 subcarry_ult_2x64_2 function currently produces suboptimal
setb/sete/and/or instead of cmp+sbb.
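For reference, a minimal C sketch of the "naive" implementation that
produces this pattern (the helper name is illustrative, not part of the
patch; the formula is the one quoted in the subcarry.ll comment):

  #include <stdbool.h>

  /* Unsigned less-than on a 2x64-bit value (x1:x0 < y1:y0): the high
     limbs decide unless they are equal, in which case the low-limb
     comparison (the incoming borrow) decides. */
  static bool ult_2x64(unsigned long long x0, unsigned long long x1,
                       unsigned long long y0, unsigned long long y1) {
    return (x1 < y1) | ((x0 < y0) & (x1 == y1));
  }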
---
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 129 ++++++++++++++++++++++++-
llvm/test/CodeGen/X86/subcarry.ll | 78 ++++++++++++++-
2 files changed, 201 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee..b91616805d402 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -120,6 +120,125 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
ret i1 %ov
}
+; Borrow chain: or(icmp ult X, Y, and(icmp eq X, Y, carry_in)) -> cmp + sbcs.
+; See https://github.com/llvm/llvm-project/issues/106118.
+
+define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Commuted eq operands: icmp eq Y, X instead of icmp eq X, Y.
+define i1 @subcarry_ult_2x64_commuted_eq(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64_commuted_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x3, x1, #0, lo
+; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %y1, %x1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y2) nounwind {
+; CHECK-LABEL: subcarry_ult_3x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x3
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x4
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w8, w8, wzr, hs
+; CHECK-NEXT: cmp x2, x5
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp1 = and i1 %b0, %e1
+ %br1 = or i1 %b1, %bp1
+ %b2 = icmp ult i64 %x2, %y2
+ %e2 = icmp eq i64 %x2, %y2
+ %bp2 = and i1 %br1, %e2
+ %br2 = or i1 %b2, %bp2
+ ret i1 %br2
+}
+
+; Unsigned less than of two 2x128 integers combined into a borrow chain.
+define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x2, x6
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: cmp x3, x7
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: cmp x0, x4
+; CHECK-NEXT: sbcs xzr, x1, x5
+; CHECK-NEXT: csel w8, wzr, w8, hs
+; CHECK-NEXT: cmp x2, x6
+; CHECK-NEXT: sbcs xzr, x3, x7
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i128 %x0, %y0
+ %b1 = icmp ult i128 %x1, %y1
+ %e1 = icmp eq i128 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Negative test: icmp eq compares different operands than icmp ult.
+define i1 @no_subcarry_different_ops(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_different_ops:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x0, #0, lo
+; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %x0
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Negative test: icmp slt instead of icmp ult.
+define i1 @no_subcarry_signed(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_signed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, ge
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp slt i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
; Verify insertion point for multi-BB.
declare void @call(i1)
@@ -127,12 +246,12 @@ declare void @call(i1)
define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwind {
; CHECK-LABEL: usubo_ult_sub_dominates_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: tbz w3, #0, .LBB7_2
+; CHECK-NEXT: tbz w3, #0, .LBB13_2
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: subs x8, x0, x1
; CHECK-NEXT: cset w3, lo
; CHECK-NEXT: str x8, [x2]
-; CHECK-NEXT: .LBB7_2: // %common.ret
+; CHECK-NEXT: .LBB13_2: // %common.ret
; CHECK-NEXT: and w0, w3, #0x1
; CHECK-NEXT: ret
entry:
@@ -158,7 +277,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w3
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: tbz w3, #0, .LBB8_3
+; CHECK-NEXT: tbz w3, #0, .LBB14_3
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: cmp x0, x1
; CHECK-NEXT: mov x22, x0
@@ -168,11 +287,11 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: mov w0, w21
; CHECK-NEXT: bl call
; CHECK-NEXT: subs x8, x22, x23
-; CHECK-NEXT: b.hs .LBB8_3
+; CHECK-NEXT: b.hs .LBB14_3
; CHECK-NEXT: // %bb.2: // %end
; CHECK-NEXT: mov w19, w21
; CHECK-NEXT: str x8, [x20]
-; CHECK-NEXT: .LBB8_3: // %common.ret
+; CHECK-NEXT: .LBB14_3: // %common.ret
; CHECK-NEXT: and w0, w19, #0x1
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 60bedad53ac9a..e9e79962c5662 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -1355,7 +1355,7 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb.
+; TODO: This should be optimized to cmp + sbb (https://github.com/llvm/llvm-project/issues/106118).
define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x64_2:
; X64: # %bb.0: # %entry
@@ -1437,5 +1437,81 @@ entry:
%2 = and i1 %1, %0
ret i1 %2
}
+
+; Unsigned less than of two 2x128 integers combined into a borrow chain.
+define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
+; X64-LABEL: subcarry_ult_2x128:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: setb %sil
+; X64-NEXT: cmpq %r10, %rdx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: sbbq %rax, %rdi
+; X64-NEXT: setb %dil
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: xorq %r10, %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: sete %al
+; X64-NEXT: andb %sil, %al
+; X64-NEXT: orb %dil, %al
+; X64-NEXT: retq
+;
+; X86-LABEL: subcarry_ult_2x128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: cmpl 40(%ebp), %edx
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %edx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: cmpl 56(%ebp), %esi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sbbl 60(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl 64(%ebp), %edi
+; X86-NEXT: movl 68(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %ah
+; X86-NEXT: xorl %edi, %ecx
+; X86-NEXT: xorl 60(%ebp), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: xorl 64(%ebp), %edx
+; X86-NEXT: xorl 56(%ebp), %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: sete %al
+; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %b0 = icmp ult i128 %x0, %y0
+ %b1 = icmp ult i128 %x1, %y1
+ %e1 = icmp eq i128 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
From 847fbf1986beb3d4f12c0a5ca67a18c11a5b6697 Mon Sep 17 00:00:00 2001
From: Paweł Bylica <pawel at hepcolgum.band>
Date: Sat, 4 Apr 2026 19:04:02 +0200
Subject: [PATCH 2/2] [CGP] Reconstruct borrow chain from icmp pattern for
subtract-with-carry
InstCombine simplifies chained usub.with.overflow intrinsics (from
multi-precision subtraction, e.g. __builtin_subcll) into an icmp pattern:
carry_out = or(icmp ult A, B, and(icmp eq A, B, carry_in))
This is algebraically correct but prevents the backend from generating
efficient subtract-with-borrow instructions (x86 sbb, aarch64 sbcs).
Add combineToUSubWithCarry() to CodeGenPrepare to match this pattern and
reconstruct the chained usub.with.overflow form. The existing DAG
combine (combineCarryDiamond) then folds this into USUBO_CARRY.
The transformation is gated on target support for ISD::USUBO_CARRY.
Fixes #106118.
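For context, a C sketch of the per-limb decomposition the new hook emits,
written with the overflow builtins that lower to usub.with.overflow (the
helper name and the chaining description are illustrative, not part of
the patch):

  #include <stdbool.h>

  /* One limb of A - B - carry_in: it borrows iff either partial
     subtraction borrows (the intermediate difference can underflow at
     most once). The OR of the two borrows is the shape that
     combineCarryDiamond recognizes and folds into USUBO_CARRY. */
  static bool sub_limb(unsigned long long a, unsigned long long b,
                       bool carry_in, unsigned long long *diff) {
    bool borrow1 = __builtin_sub_overflow(a, b, diff);
    bool borrow2 = __builtin_sub_overflow(
        *diff, (unsigned long long)carry_in, diff);
    return borrow1 | borrow2;
  }

Chaining sub_limb from the low limb upward, feeding each borrow into the
next call, is the multi-precision subtraction that __builtin_subcll
expands to, and whose canonicalized icmp form this patch matches.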
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 62 ++++++++++++++++++++++++++
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 35 ++++++---------
llvm/test/CodeGen/X86/subcarry.ll | 57 ++++++++---------------
3 files changed, 95 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index aae6e718901fe..7afc9f32eabca 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -469,6 +469,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUSubWithCarry(BinaryOperator *OrI);
bool unfoldPowerOf2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
@@ -1797,6 +1798,63 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+/// FIXME: The name is similar to combineToUSubWithOverflow. Consider renaming
+/// one or both to better distinguish them.
+///
+/// Reconstruct a subtract-with-borrow chain from its canonicalized icmp form.
+///
+/// InstCombine simplifies chained usub.with.overflow intrinsics (used for
+/// multi-precision subtraction) into icmp patterns:
+/// carry_out = or(icmp ult A, B, and(icmp eq A, B, carry_in))
+///
+/// This is algebraically correct but prevents the backend from generating
+/// efficient subtract-with-borrow instructions (e.g. x86 sbb, aarch64 sbcs).
+///
+/// This function matches the pattern and reconstructs the chained
+/// usub.with.overflow form that the DAG combiner can lower to USUBO_CARRY.
+bool CodeGenPrepare::combineToUSubWithCarry(BinaryOperator *OrI) {
+ if (!OrI->getType()->isIntOrIntVectorTy(1))
+ return false;
+
+ assert(OrI->getOpcode() == Instruction::Or && "Expected or instruction");
+
+ // Match: or(icmp ult A, B, and(icmp eq A, B, carry_in))
+ // with all commuted variants of and and icmp eq operand order.
+ Value *A, *B, *CarryIn;
+ if (!match(
+ OrI,
+ m_c_BinOp(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(A), m_Value(B)),
+ m_c_And(m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A),
+ m_Deferred(B)),
+ m_Value(CarryIn)))))
+ return false;
+
+ Type *OpTy = A->getType();
+ EVT VT = TLI->getValueType(*DL, OpTy);
+ if (VT.isSimple())
+ VT = TLI->getTypeToTransformTo(OrI->getContext(), VT);
+ if (!TLI->isOperationLegalOrCustom(ISD::USUBO_CARRY, VT))
+ return false;
+
+ // Decompose into two usub.with.overflow whose borrows are OR'd.
+ // The DAG combiner (combineCarryDiamond) folds this into USUBO_CARRY.
+ IRBuilder<> Builder(OrI);
+
+ Value *CarryExt = Builder.CreateZExt(CarryIn, OpTy, "carry.ext");
+ Value *Sub1 =
+ Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, A, B);
+ Value *Diff = Builder.CreateExtractValue(Sub1, 0, "diff");
+ Value *Borrow1 = Builder.CreateExtractValue(Sub1, 1, "borrow");
+ Value *Sub2 = Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow,
+ Diff, CarryExt);
+ Value *Borrow2 = Builder.CreateExtractValue(Sub2, 1, "borrow");
+ Value *Result = Builder.CreateOr(Borrow1, Borrow2, "carry.out");
+
+ replaceAllUsesWith(OrI, Result, FreshBBs, IsHugeFunc);
+ OrI->eraseFromParent();
+ return true;
+}
+
// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
// The same transformation exists in DAG combiner, but we repeat it here because
// DAG builder can break the pattern by moving icmp into a successor block.
@@ -9011,6 +9069,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
return true;
+ if (BinOp && BinOp->getOpcode() == Instruction::Or &&
+ combineToUSubWithCarry(BinOp))
+ return true;
+
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index b91616805d402..5712e4359e9e9 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -127,10 +127,8 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; CHECK-LABEL: subcarry_ult_2x64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: cmp x1, x3
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: sbcs xzr, x1, x3
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
%b1 = icmp ult i64 %x1, %y1
@@ -145,8 +143,7 @@ define i1 @subcarry_ult_2x64_commuted_eq(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nou
; CHECK-LABEL: subcarry_ult_2x64_commuted_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: ccmp x3, x1, #0, lo
-; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: sbcs xzr, x1, x3
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
@@ -161,13 +158,9 @@ define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y
; CHECK-LABEL: subcarry_ult_3x64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x3
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: cmp x1, x4
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w8, w8, wzr, hs
-; CHECK-NEXT: cmp x2, x5
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: sbcs xzr, x1, x4
+; CHECK-NEXT: sbcs xzr, x2, x5
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
%b1 = icmp ult i64 %x1, %y1
@@ -185,16 +178,16 @@ define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y
define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
; CHECK-LABEL: subcarry_ult_2x128:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x2, x6
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: cmp x3, x7
-; CHECK-NEXT: csel w8, wzr, w8, ne
; CHECK-NEXT: cmp x0, x4
; CHECK-NEXT: sbcs xzr, x1, x5
-; CHECK-NEXT: csel w8, wzr, w8, hs
-; CHECK-NEXT: cmp x2, x6
-; CHECK-NEXT: sbcs xzr, x3, x7
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: subs x9, x2, x6
+; CHECK-NEXT: sbcs x10, x3, x7
+; CHECK-NEXT: cset w11, lo
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: sbcs xzr, x10, xzr
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: orr w0, w11, w8
; CHECK-NEXT: ret
%b0 = icmp ult i128 %x0, %y0
%b1 = icmp ult i128 %x1, %y1
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index e9e79962c5662..2c8346de637b0 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -1355,45 +1355,32 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb (https://github.com/llvm/llvm-project/issues/106118).
define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x64_2:
; X64: # %bb.0: # %entry
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: setb %dl
-; X64-NEXT: cmpq %rcx, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: sete %al
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: sbbq %rcx, %rsi
+; X64-NEXT: setb %al
; X64-NEXT: retq
;
; X86-LABEL: subcarry_ult_2x64_2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: setb %bl
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: setb %bh
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: andb %bl, %al
-; X86-NEXT: orb %bh, %al
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: orb %bl, %al
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
entry:
@@ -1442,21 +1429,15 @@ entry:
define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x128:
; X64: # %bb.0:
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setb %r10b
; X64-NEXT: cmpq %r8, %rdi
; X64-NEXT: sbbq %r9, %rsi
-; X64-NEXT: setb %sil
-; X64-NEXT: cmpq %r10, %rdx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: sbbq %rax, %rdi
-; X64-NEXT: setb %dil
-; X64-NEXT: xorq %rax, %rcx
-; X64-NEXT: xorq %r10, %rdx
-; X64-NEXT: orq %rcx, %rdx
-; X64-NEXT: sete %al
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: orb %dil, %al
+; X64-NEXT: sbbq $0, %rdx
+; X64-NEXT: sbbq $0, %rcx
+; X64-NEXT: setb %al
+; X64-NEXT: orb %r10b, %al
; X64-NEXT: retq
;
; X86-LABEL: subcarry_ult_2x128: