[llvm] [DAG] Fold (umin (sub a b) a) -> (usubo a b); (select usubo.1 a usubo.0) (PR #161651)

Chaitanya Koparkar via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 10 05:20:54 PST 2025


https://github.com/ckoparkar updated https://github.com/llvm/llvm-project/pull/161651

>From 3a24f85aebb329f3ccc7272114e61201ef4ff161 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 07:29:30 -0400
Subject: [PATCH 01/10] [DAG] Fold (umin (sub a b) a) -> (usubo a b); (select
 usubo.1 a usubo.0)

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++++++++++++
 .../CodeGen/X86/underflow-compare-fold.ll     | 29 +++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/underflow-compare-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 558c5a0390228..c188ab49f2821 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6199,6 +6199,30 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
                                         SDLoc(N), VT, N0, N1))
     return SD;
 
+  // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
+  //
+  // IR:
+  //   %sub  = sub %a, %b
+  //   %cond = umin %sub, %a
+  //   ->
+  //   %usubo    = usubo %a, %b
+  //   %overflow = extractvalue %usubo, 1
+  //   %sub      = extractvalue %usubo, 0
+  //   %cond     = select %overflow, %a, %sub
+  if (N0.getOpcode() == ISD::SUB) {
+    SDValue A, B, C;
+    if (sd_match(N,
+                 m_AnyOf(m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)),
+                         m_SMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C))))) {
+      EVT AVT = A.getValueType();
+      if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
+        SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
+        SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
+        return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
+      }
+    }
+  }
+
   // Simplify the operands using demanded-bits information.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
new file mode 100644
index 0000000000000..1d1cfa4f97c50
--- /dev/null
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
+; CHECK-LABEL: subIfNoUnderflow_umin
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq    %rdi, %rax
+; CHECK-NEXT: subq    %rsi, %rax
+; CHECK-NEXT: cmovbq  %rdi, %rax
+; retq
+entry:
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+define i64 @subIfNoUnderflow_smin(i64 %a, i64 %b) {
+; CHECK-LABEL: subIfNoUnderflow_smin
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq    %rdi, %rax
+; CHECK-NEXT: subq    %rsi, %rax
+; CHECK-NEXT: cmovbq  %rdi, %rax
+; retq
+entry:
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.smin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
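
The unsigned identity this combine relies on can be sanity-checked exhaustively at i8 width. Below is a minimal standalone C++ check (not part of the patch), assuming a GCC/Clang-style __builtin_sub_overflow, whose result flag plays the role of usubo.1:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      // umin(sub(a,b), a), with a wrapping subtraction.
      uint8_t Sub = static_cast<uint8_t>(A - B);
      uint8_t UMin = std::min<uint8_t>(Sub, static_cast<uint8_t>(A));
      // usubo(a,b); select(usubo.1, a, usubo.0).
      uint8_t Diff;
      bool Borrow = __builtin_sub_overflow(static_cast<uint8_t>(A),
                                           static_cast<uint8_t>(B), &Diff);
      uint8_t Folded = Borrow ? static_cast<uint8_t>(A) : Diff;
      assert(UMin == Folded && "fold must preserve umin semantics");
    }
  }
  return 0;
}

If b <= a the subtraction does not borrow and both forms yield a - b; if b > a the wrapped difference exceeds a, so umin picks a, exactly as the select on the borrow flag does.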

>From 0db4bf0442f37a26a577b974d69962332e19d378 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 08:30:36 -0400
Subject: [PATCH 02/10] Remove smin pattern; it might not be correct

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp   |  4 +---
 llvm/test/CodeGen/X86/underflow-compare-fold.ll | 13 -------------
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c188ab49f2821..99d7000c3b62e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6211,9 +6211,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
   //   %cond     = select %overflow, %a, %sub
   if (N0.getOpcode() == ISD::SUB) {
     SDValue A, B, C;
-    if (sd_match(N,
-                 m_AnyOf(m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)),
-                         m_SMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C))))) {
+    if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)))) {
       EVT AVT = A.getValueType();
       if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
         SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 1d1cfa4f97c50..4dcaefc7a5586 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -14,16 +14,3 @@ entry:
   %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
   ret i64 %cond
 }
-
-define i64 @subIfNoUnderflow_smin(i64 %a, i64 %b) {
-; CHECK-LABEL: subIfNoUnderflow_smin
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq    %rdi, %rax
-; CHECK-NEXT: subq    %rsi, %rax
-; CHECK-NEXT: cmovbq  %rdi, %rax
-; retq
-entry:
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.smin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
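
Dropping the smin form is the right call: the rewrite is only sound for an unsigned compare. A concrete counterexample (standalone C++, not part of the patch): with i8 values a = 0 and b = 1, smin(a - b, a) = smin(-1, 0) = -1, but the usubo-based select yields a = 0, since unsigned 0 - 1 borrows:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  int8_t A = 0, B = 1;
  int8_t Sub = static_cast<int8_t>(A - B); // -1
  int8_t SMin = std::min(Sub, A);          // smin(sub(a,b), a) == -1
  uint8_t Diff;
  bool Borrow = __builtin_sub_overflow(static_cast<uint8_t>(A),
                                       static_cast<uint8_t>(B), &Diff);
  int8_t Folded = Borrow ? A : static_cast<int8_t>(Diff);
  std::printf("smin = %d, folded = %d\n", SMin, Folded); // -1 vs 0
  return 0;
}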

>From c0ba3fd77bd8e7d8e3ce7fbe09cb50a6e211e0d7 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 09:28:15 -0400
Subject: [PATCH 03/10] Fix check for retq

---
 llvm/test/CodeGen/X86/underflow-compare-fold.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 4dcaefc7a5586..2416bcb909485 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -8,7 +8,7 @@ define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
 ; CHECK-NEXT: movq    %rdi, %rax
 ; CHECK-NEXT: subq    %rsi, %rax
 ; CHECK-NEXT: cmovbq  %rdi, %rax
-; retq
+; CHECK-NEXT: retq
 entry:
   %sub = sub i64 %a, %b
   %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)

>From 57e2ea0310b83baab4efae67794d03ef53c364f2 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 3 Oct 2025 07:43:07 -0400
Subject: [PATCH 04/10] Address review comments

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  | 18 ++++--------------
 .../CodeGen/AArch64/underflow-compare-fold.ll  | 14 ++++++++++++++
 .../test/CodeGen/X86/underflow-compare-fold.ll |  5 ++---
 3 files changed, 20 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/underflow-compare-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 99d7000c3b62e..5d559d713b3eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6200,21 +6200,11 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
     return SD;
 
   // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
-  //
-  // IR:
-  //   %sub  = sub %a, %b
-  //   %cond = umin %sub, %a
-  //   ->
-  //   %usubo    = usubo %a, %b
-  //   %overflow = extractvalue %usubo, 1
-  //   %sub      = extractvalue %usubo, 0
-  //   %cond     = select %overflow, %a, %sub
   if (N0.getOpcode() == ISD::SUB) {
-    SDValue A, B, C;
-    if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)))) {
-      EVT AVT = A.getValueType();
-      if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
-        SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
+    SDValue A, B;
+    if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
+      if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
+        SDVTList VTs = DAG.getVTList(VT, MVT::i1);
         SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
         return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
       }
diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
new file mode 100644
index 0000000000000..6dcf2479d6daa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+define i64 @underflow_compare_fold(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold
+; CHECK:      // %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 2416bcb909485..a520ee1621d86 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -2,14 +2,13 @@
 
 ; GitHub issue #161036
 
-define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
-; CHECK-LABEL: subIfNoUnderflow_umin
+define i64 @underflow_compare_fold(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold
 ; CHECK-LABEL: %bb.0
 ; CHECK-NEXT: movq    %rdi, %rax
 ; CHECK-NEXT: subq    %rsi, %rax
 ; CHECK-NEXT: cmovbq  %rdi, %rax
 ; CHECK-NEXT: retq
-entry:
   %sub = sub i64 %a, %b
   %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
   ret i64 %cond
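
The m_Deferred(A) constraint introduced here (replacing the explicit A == C check) is what keeps the rewrite sound: if the second umin operand were some unrelated value c, the select form would be wrong. A standalone C++ counterexample with hypothetical values a = 5, b = 3, c = 0:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t A = 5, B = 3, C = 0;
  uint8_t Sub = static_cast<uint8_t>(A - B); // 2
  uint8_t UMin = std::min(Sub, C);           // umin(sub(a,b), c) == 0
  uint8_t Diff;
  bool Borrow = __builtin_sub_overflow(A, B, &Diff);
  uint8_t Folded = Borrow ? C : Diff;        // select(usubo.1, c, usubo.0) == 2
  std::printf("umin = %d, folded = %d\n", UMin, Folded);
  return 0;
}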

>From 7e08f231fea2d17e1c36f2da99bb4b626518e72e Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Wed, 22 Oct 2025 08:39:37 -0400
Subject: [PATCH 05/10] Don't use MVT::i1

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5d559d713b3eb..8c6cde629b751 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6200,11 +6200,12 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
     return SD;
 
   // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
-  if (N0.getOpcode() == ISD::SUB) {
+  {
     SDValue A, B;
     if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
       if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
-        SDVTList VTs = DAG.getVTList(VT, MVT::i1);
+        EVT SETCCT = getSetCCResultType(VT);
+        SDVTList VTs = DAG.getVTList(VT, SETCCT);
         SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
         return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
       }

>From 7660fae35b12d17dc0080c2736475c8a3ba89007 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 07:50:22 -0400
Subject: [PATCH 06/10] Fix pattern match

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8c6cde629b751..66be2d7e16045 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6202,13 +6202,13 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
   // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
   {
     SDValue A, B;
-    if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
-      if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
-        EVT SETCCT = getSetCCResultType(VT);
-        SDVTList VTs = DAG.getVTList(VT, SETCCT);
-        SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
-        return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
-      }
+    if (sd_match(N0, m_Sub(m_Value(A), m_Value(B))) &&
+        sd_match(N1, m_Specific(A)) &&
+        TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
+      EVT SETCCT = getSetCCResultType(VT);
+      SDVTList VTs = DAG.getVTList(VT, SETCCT);
+      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
+      return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
     }
   }
 

>From d795383e23462d2d5b6fa2c21973c2d2a26fb400 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 08:32:27 -0400
Subject: [PATCH 07/10] Add some more tests

---
 .../CodeGen/AArch64/underflow-compare-fold.ll | 39 ++++++++++++++++++
 .../CodeGen/X86/underflow-compare-fold.ll     | 41 +++++++++++++++++++
 2 files changed, 80 insertions(+)

diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
index 6dcf2479d6daa..5eb831e98c677 100644
--- a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
@@ -2,6 +2,7 @@
 
 ; GitHub issue #161036
 
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
 define i64 @underflow_compare_fold(i64 %a, i64 %b) {
 ; CHECK-LABEL: underflow_compare_fold
 ; CHECK:      // %bb.0:
@@ -12,3 +13,41 @@ define i64 @underflow_compare_fold(i64 %a, i64 %b) {
   %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
   ret i64 %cond
 }
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+  %sub = sub <16 x i8> %a, %b
+  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+  ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(a,sub(a,b))
+define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_sub_inverse_args
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub x8, x0, x1
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: add x8, x0, x1
+; CHECK-NEXT: cmp x8, x0
+; CHECK-NEXT: csel x0, x8, x0, lo
+; CHECK-NEXT: ret
+  %add = add i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+  ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index a520ee1621d86..366c3f040e962 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -2,6 +2,7 @@
 
 ; GitHub issue #161036
 
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
 define i64 @underflow_compare_fold(i64 %a, i64 %b) {
 ; CHECK-LABEL: underflow_compare_fold
 ; CHECK-LABEL: %bb.0
@@ -13,3 +14,43 @@ define i64 @underflow_compare_fold(i64 %a, i64 %b) {
   %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
   ret i64 %cond
 }
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %sub = sub <16 x i8> %a, %b
+  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+  ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(a,sub(a,b))
+define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_sub_inverse_args
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+  %add = add i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+  ret i64 %cond
+}

>From a27dcf53a5ed5f77aa8c6dafcdfc91459bcb462f Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 08:55:44 -0400
Subject: [PATCH 08/10] Simplify pattern match

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 66be2d7e16045..f144ce2888dc1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6201,14 +6201,13 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
 
   // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
   {
-    SDValue A, B;
-    if (sd_match(N0, m_Sub(m_Value(A), m_Value(B))) &&
-        sd_match(N1, m_Specific(A)) &&
+    SDValue B;
+    if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B))) &&
         TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
       EVT SETCCT = getSetCCResultType(VT);
       SDVTList VTs = DAG.getVTList(VT, SETCCT);
-      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
-      return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
+      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
+      return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
     }
   }
 

>From e1ba30be7603af33f73fea4cb6c815f160a76e52 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 6 Nov 2025 08:41:19 -0500
Subject: [PATCH 09/10] Edits

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  20 ++-
 .../umin-sub-to-usubo-select-combine.ll       | 151 +++++++++++++++++
 .../CodeGen/AArch64/underflow-compare-fold.ll |  53 ------
 .../X86/umin-sub-to-usubo-select-combine.ll   | 156 ++++++++++++++++++
 .../CodeGen/X86/underflow-compare-fold.ll     |  56 -------
 5 files changed, 320 insertions(+), 116 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
 create mode 100644 llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
 delete mode 100644 llvm/test/CodeGen/X86/underflow-compare-fold.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d43ec1002cd86..39c91fdfd371c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6216,16 +6216,22 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
                                         SDLoc(N), VT, N0, N1))
     return SD;
 
-  // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
-  {
+  if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
     SDValue B;
-    if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B))) &&
-        TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
-      EVT SETCCT = getSetCCResultType(VT);
-      SDVTList VTs = DAG.getVTList(VT, SETCCT);
+
+    // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0)
+    if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) {
+      SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
       SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
       return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
     }
+
+    // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0)
+    if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) {
+      SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
+      SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B);
+      return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0));
+    }
   }
 
   // Simplify the operands using demanded-bits information.
@@ -9386,7 +9392,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
 // Check if the bytes offsets we are looking at match with either big or
 // little endian value loaded. Return true for big endian, false for little
 // endian, and std::nullopt if match failed.
-static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
                                        int64_t FirstOffset) {
   // The endian can be decided only when it is 2 bytes at least.
   unsigned Width = ByteOffsets.size();
diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..fe3eee06db65e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,151 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs x8, x0, x1
+; CHECK-NEXT:  csel x0, x0, x8, lo
+; CHECK-NEXT:  ret
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs x8, x0, x1
+; CHECK-NEXT:  csel x0, x0, x8, lo
+; CHECK-NEXT:  ret
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs x8, x0, x1
+; CHECK-NEXT:  csel x0, x0, x8, lo
+; CHECK-NEXT:  str	x8, [x2]
+; CHECK-NEXT:  ret
+  %sub = sub i64 %a, %b
+  store i64 %sub, ptr addrspace(1) %ptr
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs w8, w0, w1
+; CHECK-NEXT:  csel w0, w0, w8, lo
+; CHECK-NEXT:  ret
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs w8, w0, w1
+; CHECK-NEXT:  csel w0, w0, w8, lo
+; CHECK-NEXT:  ret
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+  ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT:  subs w8, w0, w1
+; CHECK-NEXT:  csel w0, w0, w8, lo
+; CHECK-NEXT:  str	w8, [x2]
+; CHECK-NEXT:  ret
+  %sub = sub i32 %a, %b
+  store i32 %sub, ptr addrspace(1) %ptr
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: sub w8, w0, w1
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: cmp w8, w9
+; CHECK-NEXT: csel w0, w8, w9, lo
+; CHECK-NEXT: ret
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: sub w8, w0, w1
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: and w8, w8, #0xffff
+; CHECK-NEXT: cmp w9, w8
+; CHECK-NEXT: csel w0, w9, w8, lo
+; CHECK-NEXT: ret
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+  ret i16 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: sub w8, w0, w1
+; CHECK-NEXT: and w9, w0, #0xffff
+; CHECK-NEXT: and w10, w8, #0xffff
+; CHECK-NEXT: strh w8, [x2]
+; CHECK-NEXT: cmp w10, w9
+; CHECK-NEXT: csel w0, w10, w9, lo
+; CHECK-NEXT: ret
+  %sub = sub i16 %a, %b
+  store i16 %sub, ptr addrspace(1) %ptr
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+  %sub = sub <16 x i8> %a, %b
+  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+  ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: add x8, x0, x1
+; CHECK-NEXT: cmp x8, x0
+; CHECK-NEXT: csel x0, x8, x0, lo
+; CHECK-NEXT: ret
+  %add = add i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+  ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
deleted file mode 100644
index 5eb831e98c677..0000000000000
--- a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold
-; CHECK:      // %bb.0:
-; CHECK-NEXT: subs x8, x0, x1
-; CHECK-NEXT: csel x0, x0, x8, lo
-; CHECK-NEXT: ret
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
-  %sub = sub <16 x i8> %a, %b
-  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
-  ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(a,sub(a,b))
-define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_sub_inverse_args
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: sub x8, x0, x1
-; CHECK-NEXT: cmp x0, x8
-; CHECK-NEXT: csel x0, x0, x8, lo
-; CHECK-NEXT: ret
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
-  ret i64 %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmp x8, x0
-; CHECK-NEXT: csel x0, x8, x0, lo
-; CHECK-NEXT: ret
-  %add = add i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
-  ret i64 %cond
-}
diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..e9756b411eb2c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,156 @@
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq    %rdi, %rax
+; CHECK-NEXT: subq    %rsi, %rax
+; CHECK-NEXT: cmovbq  %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq    %rdi, %rax
+; CHECK-NEXT: subq    %rsi, %rax
+; CHECK-NEXT: cmovbq  %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+  ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq    %rdi, %rax
+; CHECK-NEXT: subq    %rsi, %rax
+; CHECK-NEXT: movq    %rax, (%rdx)
+; CHECK-NEXT: cmovbq  %rdi, %rax
+; CHECK-NEXT: retq
+  %sub = sub i64 %a, %b
+  store i64 %sub, ptr addrspace(1) %ptr
+  %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+  ret i64 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subl    %esi, %eax
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subl    %esi, %eax
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+  ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subl    %esi, %eax
+; CHECK-NEXT: movl    %eax, (%rdx)
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: retq
+  %sub = sub i32 %a, %b
+  store i32 %sub, ptr addrspace(1) %ptr
+  %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+  ret i32 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subw    %si,  %ax
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subw    %si,  %ax
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+  ret i16 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl    %edi, %eax
+; CHECK-NEXT: subw    %si,  %ax
+; CHECK-NEXT: movw    %ax,  (%rdx)
+; CHECK-NEXT: cmovbl  %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+  %sub = sub i16 %a, %b
+  store i16 %sub, ptr addrspace(1) %ptr
+  %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+  ret i16 %cond
+}
+
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+  %sub = sub <16 x i8> %a, %b
+  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+  ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+  %add = add i64 %a, %b
+  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+  ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
deleted file mode 100644
index 366c3f040e962..0000000000000
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq    %rdi, %rax
-; CHECK-NEXT: subq    %rsi, %rax
-; CHECK-NEXT: cmovbq  %rdi, %rax
-; CHECK-NEXT: retq
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
-  ret i64 %cond
-}
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psubb %xmm1, %xmm2
-; CHECK-NEXT: pminub %xmm2, %xmm0
-; CHECK-NEXT: retq
-  %sub = sub <16 x i8> %a, %b
-  %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
-  ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(a,sub(a,b))
-define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_sub_inverse_args
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: cmpq %rax, %rdi
-; CHECK-NEXT: cmovbq %rdi, %rax
-; CHECK-NEXT: retq
-  %sub = sub i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
-  ret i64 %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: leaq (%rsi,%rdi), %rax
-; CHECK-NEXT: cmpq %rdi, %rax
-; CHECK-NEXT: cmovaeq %rdi, %rax
-; CHECK-NEXT: retq
-  %add = add i64 %a, %b
-  %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
-  ret i64 %cond
-}
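
The commuted pattern added in this revision, umin(a, sub(a,b)), folds to the same usubo/select pair. The identity can be sanity-checked the same way as the original orientation (standalone C++, again assuming GCC/Clang's __builtin_sub_overflow):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    for (unsigned B = 0; B < 256; ++B) {
      uint8_t Sub = static_cast<uint8_t>(A - B);
      // Commuted operand order: umin(a, sub(a,b)).
      uint8_t UMin = std::min<uint8_t>(static_cast<uint8_t>(A), Sub);
      uint8_t Diff;
      bool Borrow = __builtin_sub_overflow(static_cast<uint8_t>(A),
                                           static_cast<uint8_t>(B), &Diff);
      assert(UMin == (Borrow ? static_cast<uint8_t>(A) : Diff));
    }
  }
  return 0;
}

The multi-use tests above fit the same picture: on x86 and AArch64 the flag-setting subtraction produces both the difference and the borrow bit, so a second user of %sub does not force extra arithmetic, as the sub/cmov and subs/csel sequences in the checks suggest.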

>From 6df10cc293bd377d3c336350ba2e378241e570af Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Mon, 10 Nov 2025 08:20:14 -0500
Subject: [PATCH 10/10] Adjust patterns for AMDGPU

---
 .../umin-sub-to-usubo-select-combine.ll       | 45 ++++++++++---------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
index 22e4a24435f12..6e105dbbb3f94 100644
--- a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -26,16 +26,16 @@ define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
 ; GFX9-LABEL: v_underflow_compare_fold_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_underflow_compare_fold_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    v_sub_co_u32 v1, vcc_lo, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %a, %b
   %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
@@ -46,16 +46,16 @@ define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
 ; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT:    v_sub_co_u32 v1, vcc_lo, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %a, %b
   %cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
@@ -66,19 +66,20 @@ define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace
 ; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT:    v_min_u32_e32 v0, v1, v0
-; GFX9-NEXT:    global_store_dword v[2:3], v1, off
+; GFX9-NEXT:    v_sub_u32_e32 v4, v0, v1
+; GFX9-NEXT:    v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT:    global_store_dword v[2:3], v4, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_min_u32_e32 v0, v1, v0
-; GFX11-NEXT:    global_store_b32 v[2:3], v1, off
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v0, v1
+; GFX11-NEXT:    v_sub_co_u32 v1, vcc_lo, v0, v1
+; GFX11-NEXT:    global_store_b32 v[2:3], v4, off
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sub = sub i32 %a, %b
   store i32 %sub, ptr addrspace(1) %ptr
@@ -190,15 +191,19 @@ define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #
 define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
 ; GFX9-LABEL: s_underflow_compare_fold_i32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_sub_i32 s1, s0, s1
-; GFX9-NEXT:    s_min_u32 s0, s1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s1
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_underflow_compare_fold_i32:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_sub_i32 s1, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_min_u32 s0, s1, s0
+; GFX11-NEXT:    v_sub_co_u32 v0, s1, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sub = sub i32 %a, %b
   %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)


