[llvm] [CGP] Reconstruct borrow chain from icmp pattern for subtract-with-carry (PR #189018)
Paweł Bylica via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 4 13:00:20 PDT 2026
https://github.com/chfast updated https://github.com/llvm/llvm-project/pull/189018
From b9f49be912686ea0b1483f1f7dc1f94531d7c88a Mon Sep 17 00:00:00 2001
From: Paweł Bylica <pawel at hepcolgum.band>
Date: Thu, 2 Apr 2026 09:23:48 +0200
Subject: [PATCH 1/2] [X86][AArch64][test] Add borrow chain tests for
subtract-with-carry (NFC)
Add test cases to cgp-usubo.ll (AArch64) and update subcarry.ll (X86)
for the borrow chain pattern:
or(icmp ult A, B, and(icmp eq A, B, carry_in))
Positive tests: 2-limb, 3-limb, commuted eq operands.
Negative tests: i128 (non-simple type), different operands, signed cmp.
The X86 subcarry_ult_2x64_2 function currently produces suboptimal
setb/sete/and/or instead of cmp+sbb.
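For reference, a minimal C sketch of the "naive" implementation that
produces this pattern (the helper name is illustrative, not part of the
patch; the formula is the one quoted in the subcarry.ll comment):

  #include <stdbool.h>

  /* Unsigned less-than on a 2x64-bit value (x1:x0 < y1:y0): the high
     limbs decide unless they are equal, in which case the low-limb
     comparison (the incoming borrow) decides. */
  static bool ult_2x64(unsigned long long x0, unsigned long long x1,
                       unsigned long long y0, unsigned long long y1) {
    return (x1 < y1) | ((x0 < y0) & (x1 == y1));
  }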
---
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 129 ++++++++++++++++++++++++-
llvm/test/CodeGen/X86/subcarry.ll | 78 ++++++++++++++-
2 files changed, 201 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index d307107fc07ee..b91616805d402 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -120,6 +120,125 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind {
ret i1 %ov
}
+; Borrow chain: or(icmp ult X, Y, and(icmp eq X, Y, carry_in)) -> cmp + sbcs.
+; See https://github.com/llvm/llvm-project/issues/106118.
+
+define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Commuted eq operands: icmp eq Y, X instead of icmp eq X, Y.
+define i1 @subcarry_ult_2x64_commuted_eq(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x64_commuted_eq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x3, x1, #0, lo
+; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %y1, %x1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y2) nounwind {
+; CHECK-LABEL: subcarry_ult_3x64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x3
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x4
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w8, w8, wzr, hs
+; CHECK-NEXT: cmp x2, x5
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp1 = and i1 %b0, %e1
+ %br1 = or i1 %b1, %bp1
+ %b2 = icmp ult i64 %x2, %y2
+ %e2 = icmp eq i64 %x2, %y2
+ %bp2 = and i1 %br1, %e2
+ %br2 = or i1 %b2, %bp2
+ ret i1 %br2
+}
+
+; Unsigned less than of two 2x128 integers combined into a borrow chain.
+define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
+; CHECK-LABEL: subcarry_ult_2x128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x2, x6
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: cmp x3, x7
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: cmp x0, x4
+; CHECK-NEXT: sbcs xzr, x1, x5
+; CHECK-NEXT: csel w8, wzr, w8, hs
+; CHECK-NEXT: cmp x2, x6
+; CHECK-NEXT: sbcs xzr, x3, x7
+; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: ret
+ %b0 = icmp ult i128 %x0, %y0
+ %b1 = icmp ult i128 %x1, %y1
+ %e1 = icmp eq i128 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Negative test: icmp eq compares different operands than icmp ult.
+define i1 @no_subcarry_different_ops(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_different_ops:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: ccmp x1, x0, #0, lo
+; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: cset w0, lo
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp ult i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %x0
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
+; Negative test: icmp slt instead of icmp ult.
+define i1 @no_subcarry_signed(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
+; CHECK-LABEL: no_subcarry_signed:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmp x0, x2
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: cmp x1, x3
+; CHECK-NEXT: csel w8, wzr, w8, ne
+; CHECK-NEXT: csinc w0, w8, wzr, ge
+; CHECK-NEXT: ret
+ %b0 = icmp ult i64 %x0, %y0
+ %b1 = icmp slt i64 %x1, %y1
+ %e1 = icmp eq i64 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
+
; Verify insertion point for multi-BB.
declare void @call(i1)
@@ -127,12 +246,12 @@ declare void @call(i1)
define i1 @usubo_ult_sub_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwind {
; CHECK-LABEL: usubo_ult_sub_dominates_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: tbz w3, #0, .LBB7_2
+; CHECK-NEXT: tbz w3, #0, .LBB13_2
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: subs x8, x0, x1
; CHECK-NEXT: cset w3, lo
; CHECK-NEXT: str x8, [x2]
-; CHECK-NEXT: .LBB7_2: // %common.ret
+; CHECK-NEXT: .LBB13_2: // %common.ret
; CHECK-NEXT: and w0, w3, #0x1
; CHECK-NEXT: ret
entry:
@@ -158,7 +277,7 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: mov w19, w3
; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: tbz w3, #0, .LBB8_3
+; CHECK-NEXT: tbz w3, #0, .LBB14_3
; CHECK-NEXT: // %bb.1: // %t
; CHECK-NEXT: cmp x0, x1
; CHECK-NEXT: mov x22, x0
@@ -168,11 +287,11 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) nounwin
; CHECK-NEXT: mov w0, w21
; CHECK-NEXT: bl call
; CHECK-NEXT: subs x8, x22, x23
-; CHECK-NEXT: b.hs .LBB8_3
+; CHECK-NEXT: b.hs .LBB14_3
; CHECK-NEXT: // %bb.2: // %end
; CHECK-NEXT: mov w19, w21
; CHECK-NEXT: str x8, [x20]
-; CHECK-NEXT: .LBB8_3: // %common.ret
+; CHECK-NEXT: .LBB14_3: // %common.ret
; CHECK-NEXT: and w0, w19, #0x1
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 60bedad53ac9a..e9e79962c5662 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -1355,7 +1355,7 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb.
+; TODO: This should be optimized to cmp + sbb (https://github.com/llvm/llvm-project/issues/106118).
define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x64_2:
; X64: # %bb.0: # %entry
@@ -1437,5 +1437,81 @@ entry:
%2 = and i1 %1, %0
ret i1 %2
}
+
+; Unsigned less than of two 2x128 integers combined into a borrow chain.
+define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
+; X64-LABEL: subcarry_ult_2x128:
+; X64: # %bb.0:
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: cmpq %r8, %rdi
+; X64-NEXT: sbbq %r9, %rsi
+; X64-NEXT: setb %sil
+; X64-NEXT: cmpq %r10, %rdx
+; X64-NEXT: movq %rcx, %rdi
+; X64-NEXT: sbbq %rax, %rdi
+; X64-NEXT: setb %dil
+; X64-NEXT: xorq %rax, %rcx
+; X64-NEXT: xorq %r10, %rdx
+; X64-NEXT: orq %rcx, %rdx
+; X64-NEXT: sete %al
+; X64-NEXT: andb %sil, %al
+; X64-NEXT: orb %dil, %al
+; X64-NEXT: retq
+;
+; X86-LABEL: subcarry_ult_2x128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl 24(%ebp), %esi
+; X86-NEXT: movl 28(%ebp), %ebx
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: cmpl 40(%ebp), %edx
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: sbbl 44(%ebp), %edx
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: sbbl 48(%ebp), %edx
+; X86-NEXT: movl 20(%ebp), %edx
+; X86-NEXT: sbbl 52(%ebp), %edx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: cmpl 56(%ebp), %esi
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: sbbl 60(%ebp), %edx
+; X86-NEXT: movl 32(%ebp), %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: sbbl 64(%ebp), %edi
+; X86-NEXT: movl 68(%ebp), %edi
+; X86-NEXT: movl 36(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: sbbl %edi, %eax
+; X86-NEXT: setb %ah
+; X86-NEXT: xorl %edi, %ecx
+; X86-NEXT: xorl 60(%ebp), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: xorl 64(%ebp), %edx
+; X86-NEXT: xorl 56(%ebp), %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: sete %al
+; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT: orb %ah, %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+ %b0 = icmp ult i128 %x0, %y0
+ %b1 = icmp ult i128 %x1, %y1
+ %e1 = icmp eq i128 %x1, %y1
+ %bp = and i1 %b0, %e1
+ %br = or i1 %b1, %bp
+ ret i1 %br
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
From 847fbf1986beb3d4f12c0a5ca67a18c11a5b6697 Mon Sep 17 00:00:00 2001
From: Paweł Bylica <pawel at hepcolgum.band>
Date: Sat, 4 Apr 2026 19:04:02 +0200
Subject: [PATCH 2/2] [CGP] Reconstruct borrow chain from icmp pattern for
subtract-with-carry
InstCombine simplifies chained usub.with.overflow intrinsics (from
multi-precision subtraction, e.g. __builtin_subcll) into an icmp pattern:
carry_out = or(icmp ult A, B, and(icmp eq A, B, carry_in))
This is algebraically correct but prevents the backend from generating
efficient subtract-with-borrow instructions (x86 sbb, aarch64 sbcs).
Add combineToUSubWithCarry() to CodeGenPrepare to match this pattern and
reconstruct the chained usub.with.overflow form. The existing DAG
combine (combineCarryDiamond) then folds this into USUBO_CARRY.
The transformation is gated on target support for ISD::USUBO_CARRY.
Fixes #106118.
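For context, a C sketch of the per-limb decomposition the new hook emits,
written with the overflow builtins that lower to usub.with.overflow (the
helper name and the chaining description are illustrative, not part of
the patch):

  #include <stdbool.h>

  /* One limb of A - B - carry_in: it borrows iff either partial
     subtraction borrows (the intermediate difference can underflow at
     most once). The OR of the two borrows is the shape that
     combineCarryDiamond recognizes and folds into USUBO_CARRY. */
  static bool sub_limb(unsigned long long a, unsigned long long b,
                       bool carry_in, unsigned long long *diff) {
    bool borrow1 = __builtin_sub_overflow(a, b, diff);
    bool borrow2 = __builtin_sub_overflow(
        *diff, (unsigned long long)carry_in, diff);
    return borrow1 | borrow2;
  }

Chaining sub_limb from the low limb upward, feeding each borrow into the
next call, is the multi-precision subtraction that __builtin_subcll
expands to, and whose canonicalized icmp form this patch matches.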
---
llvm/lib/CodeGen/CodeGenPrepare.cpp | 62 ++++++++++++++++++++++++++
llvm/test/CodeGen/AArch64/cgp-usubo.ll | 35 ++++++---------
llvm/test/CodeGen/X86/subcarry.ll | 57 ++++++++---------------
3 files changed, 95 insertions(+), 59 deletions(-)
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index aae6e718901fe..7afc9f32eabca 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -469,6 +469,7 @@ class CodeGenPrepare {
bool optimizeURem(Instruction *Rem);
bool combineToUSubWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, ModifyDT &ModifiedDT);
+ bool combineToUSubWithCarry(BinaryOperator *OrI);
bool unfoldPowerOf2Test(CmpInst *Cmp);
void verifyBFIUpdates(Function &F);
bool _run(Function &F);
@@ -1797,6 +1798,63 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return true;
}
+/// FIXME: The name is similar to combineToUSubWithOverflow. Consider renaming
+/// one or both to better distinguish them.
+///
+/// Reconstruct a subtract-with-borrow chain from its canonicalized icmp form.
+///
+/// InstCombine simplifies chained usub.with.overflow intrinsics (used for
+/// multi-precision subtraction) into icmp patterns:
+/// carry_out = or(icmp ult A, B, and(icmp eq A, B, carry_in))
+///
+/// This is algebraically correct but prevents the backend from generating
+/// efficient subtract-with-borrow instructions (e.g. x86 sbb, aarch64 sbcs).
+///
+/// This function matches the pattern and reconstructs the chained
+/// usub.with.overflow form that the DAG combiner can lower to USUBO_CARRY.
+bool CodeGenPrepare::combineToUSubWithCarry(BinaryOperator *OrI) {
+ if (!OrI->getType()->isIntOrIntVectorTy(1))
+ return false;
+
+ assert(OrI->getOpcode() == Instruction::Or && "Expected or instruction");
+
+ // Match: or(icmp ult A, B, and(icmp eq A, B, carry_in))
+ // with all commuted variants of and and icmp eq operand order.
+ Value *A, *B, *CarryIn;
+ if (!match(
+ OrI,
+ m_c_BinOp(m_SpecificICmp(ICmpInst::ICMP_ULT, m_Value(A), m_Value(B)),
+ m_c_And(m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Deferred(A),
+ m_Deferred(B)),
+ m_Value(CarryIn)))))
+ return false;
+
+ Type *OpTy = A->getType();
+ EVT VT = TLI->getValueType(*DL, OpTy);
+ if (VT.isSimple())
+ VT = TLI->getTypeToTransformTo(OrI->getContext(), VT);
+ if (!TLI->isOperationLegalOrCustom(ISD::USUBO_CARRY, VT))
+ return false;
+
+ // Decompose into two usub.with.overflow whose borrows are OR'd.
+ // The DAG combiner (combineCarryDiamond) folds this into USUBO_CARRY.
+ IRBuilder<> Builder(OrI);
+
+ Value *CarryExt = Builder.CreateZExt(CarryIn, OpTy, "carry.ext");
+ Value *Sub1 =
+ Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, A, B);
+ Value *Diff = Builder.CreateExtractValue(Sub1, 0, "diff");
+ Value *Borrow1 = Builder.CreateExtractValue(Sub1, 1, "borrow");
+ Value *Sub2 = Builder.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow,
+ Diff, CarryExt);
+ Value *Borrow2 = Builder.CreateExtractValue(Sub2, 1, "borrow");
+ Value *Result = Builder.CreateOr(Borrow1, Borrow2, "carry.out");
+
+ replaceAllUsesWith(OrI, Result, FreshBBs, IsHugeFunc);
+ OrI->eraseFromParent();
+ return true;
+}
+
// Decanonicalizes icmp+ctpop power-of-two test if ctpop is slow.
// The same transformation exists in DAG combiner, but we repeat it here because
// DAG builder can break the pattern by moving icmp into a successor block.
@@ -9011,6 +9069,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts))
return true;
+ if (BinOp && BinOp->getOpcode() == Instruction::Or &&
+ combineToUSubWithCarry(BinOp))
+ return true;
+
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
index b91616805d402..5712e4359e9e9 100644
--- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll
+++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll
@@ -127,10 +127,8 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; CHECK-LABEL: subcarry_ult_2x64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: cmp x1, x3
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: sbcs xzr, x1, x3
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
%b1 = icmp ult i64 %x1, %y1
@@ -145,8 +143,7 @@ define i1 @subcarry_ult_2x64_commuted_eq(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nou
; CHECK-LABEL: subcarry_ult_2x64_commuted_eq:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x2
-; CHECK-NEXT: ccmp x3, x1, #0, lo
-; CHECK-NEXT: ccmp x1, x3, #0, ne
+; CHECK-NEXT: sbcs xzr, x1, x3
; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
@@ -161,13 +158,9 @@ define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y
; CHECK-LABEL: subcarry_ult_3x64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, x3
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: cmp x1, x4
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w8, w8, wzr, hs
-; CHECK-NEXT: cmp x2, x5
-; CHECK-NEXT: csel w8, wzr, w8, ne
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: sbcs xzr, x1, x4
+; CHECK-NEXT: sbcs xzr, x2, x5
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
%b0 = icmp ult i64 %x0, %y0
%b1 = icmp ult i64 %x1, %y1
@@ -185,16 +178,16 @@ define i1 @subcarry_ult_3x64(i64 %x0, i64 %x1, i64 %x2, i64 %y0, i64 %y1, i64 %y
define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
; CHECK-LABEL: subcarry_ult_2x128:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmp x2, x6
-; CHECK-NEXT: cset w8, eq
-; CHECK-NEXT: cmp x3, x7
-; CHECK-NEXT: csel w8, wzr, w8, ne
; CHECK-NEXT: cmp x0, x4
; CHECK-NEXT: sbcs xzr, x1, x5
-; CHECK-NEXT: csel w8, wzr, w8, hs
-; CHECK-NEXT: cmp x2, x6
-; CHECK-NEXT: sbcs xzr, x3, x7
-; CHECK-NEXT: csinc w0, w8, wzr, hs
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: subs x9, x2, x6
+; CHECK-NEXT: sbcs x10, x3, x7
+; CHECK-NEXT: cset w11, lo
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: sbcs xzr, x10, xzr
+; CHECK-NEXT: cset w8, lo
+; CHECK-NEXT: orr w0, w11, w8
; CHECK-NEXT: ret
%b0 = icmp ult i128 %x0, %y0
%b1 = icmp ult i128 %x1, %y1
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index e9e79962c5662..2c8346de637b0 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -1355,45 +1355,32 @@ define i1 @subcarry_ult_2x64(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; https://github.com/llvm/llvm-project/commit/926e7312b2f20f2f7b0a3d5ddbd29da5625507f3
; This is also the result of "naive" implementation (x1 < y1) | ((x0 < y0) & (x1 == y1)).
; C source: https://godbolt.org/z/W1qqvqGbr
-; TODO: This should be optimized to cmp + sbb (https://github.com/llvm/llvm-project/issues/106118).
define i1 @subcarry_ult_2x64_2(i64 %x0, i64 %x1, i64 %y0, i64 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x64_2:
; X64: # %bb.0: # %entry
; X64-NEXT: cmpq %rdx, %rdi
-; X64-NEXT: setb %dl
-; X64-NEXT: cmpq %rcx, %rsi
-; X64-NEXT: setb %cl
-; X64-NEXT: sete %al
-; X64-NEXT: andb %dl, %al
-; X64-NEXT: orb %cl, %al
+; X64-NEXT: sbbq %rcx, %rsi
+; X64-NEXT: setb %al
; X64-NEXT: retq
;
; X86-LABEL: subcarry_ult_2x64_2:
; X86: # %bb.0: # %entry
; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
; X86-NEXT: setb %bl
-; X86-NEXT: cmpl %ecx, %eax
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: sbbl %esi, %edi
-; X86-NEXT: setb %bh
-; X86-NEXT: xorl %esi, %edx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: sete %al
-; X86-NEXT: andb %bl, %al
-; X86-NEXT: orb %bh, %al
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: setb %al
+; X86-NEXT: orb %bl, %al
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: retl
entry:
@@ -1442,21 +1429,15 @@ entry:
define i1 @subcarry_ult_2x128(i128 %x0, i128 %x1, i128 %y0, i128 %y1) nounwind {
; X64-LABEL: subcarry_ult_2x128:
; X64: # %bb.0:
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: subq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: setb %r10b
; X64-NEXT: cmpq %r8, %rdi
; X64-NEXT: sbbq %r9, %rsi
-; X64-NEXT: setb %sil
-; X64-NEXT: cmpq %r10, %rdx
-; X64-NEXT: movq %rcx, %rdi
-; X64-NEXT: sbbq %rax, %rdi
-; X64-NEXT: setb %dil
-; X64-NEXT: xorq %rax, %rcx
-; X64-NEXT: xorq %r10, %rdx
-; X64-NEXT: orq %rcx, %rdx
-; X64-NEXT: sete %al
-; X64-NEXT: andb %sil, %al
-; X64-NEXT: orb %dil, %al
+; X64-NEXT: sbbq $0, %rdx
+; X64-NEXT: sbbq $0, %rcx
+; X64-NEXT: setb %al
+; X64-NEXT: orb %r10b, %al
; X64-NEXT: retq
;
; X86-LABEL: subcarry_ult_2x128: