[llvm] [DAG] Fold (umin (sub a b) a) -> (usubo a b); (select usubo.1 a usubo.0) (PR #161651)
Chaitanya Koparkar via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 09:40:08 PST 2025
https://github.com/ckoparkar updated https://github.com/llvm/llvm-project/pull/161651
From 3a24f85aebb329f3ccc7272114e61201ef4ff161 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 07:29:30 -0400
Subject: [PATCH 01/11] [DAG] Fold (umin (sub a b) a) -> (usubo a b); (select
usubo.1 a usubo.0)
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++++++++++++
.../CodeGen/X86/underflow-compare-fold.ll | 29 +++++++++++++++++++
2 files changed, 53 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/underflow-compare-fold.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 558c5a0390228..c188ab49f2821 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6199,6 +6199,30 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDLoc(N), VT, N0, N1))
return SD;
+ // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
+ //
+ // IR:
+ // %sub = sub %a, %b
+ // %cond = umin %sub, %a
+ // ->
+ // %usubo = usubo %a, %b
+ // %overflow = extractvalue %usubo, 1
+ // %sub = extractvalue %usubo, 0
+ // %cond = select %overflow, %a, %sub
+ if (N0.getOpcode() == ISD::SUB) {
+ SDValue A, B, C;
+ if (sd_match(N,
+ m_AnyOf(m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)),
+ m_SMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C))))) {
+ EVT AVT = A.getValueType();
+ if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
+ SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
+ SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
+ return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
+ }
+ }
+ }
+
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
new file mode 100644
index 0000000000000..1d1cfa4f97c50
--- /dev/null
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
+; CHECK-LABEL: subIfNoUnderflow_umin
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rax
+; retq
+entry:
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+define i64 @subIfNoUnderflow_smin(i64 %a, i64 %b) {
+; CHECK-LABEL: subIfNoUnderflow_smin
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rax
+; retq
+entry:
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.smin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
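
As a standalone illustration (not part of the patch series): the combine relies on the fact that, for unsigned values, umin(a - b, a) yields a exactly when the subtraction wraps (i.e. when a < b), which is precisely what the borrow bit of usubo reports. A minimal C++ sketch of that equivalence, with all names invented for the example:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Original pattern: unsigned min of the wrapped difference and the minuend.
static uint64_t umin_of_sub(uint64_t a, uint64_t b) {
  return std::min(a - b, a); // a - b wraps modulo 2^64 when a < b
}

// Rewritten pattern: one subtraction plus a select on the borrow,
// mirroring (usubo a, b); (select usubo.1, a, usubo.0).
static uint64_t usubo_then_select(uint64_t a, uint64_t b) {
  uint64_t diff = a - b;  // usubo.0
  bool borrow = a < b;    // usubo.1
  return borrow ? a : diff;
}

int main() {
  const uint64_t vals[] = {0, 1, 7, 42, UINT64_MAX - 1, UINT64_MAX};
  for (uint64_t a : vals)
    for (uint64_t b : vals)
      assert(umin_of_sub(a, b) == usubo_then_select(a, b));
  return 0;
}

On x86-64 this is what lets the pair lower to a single subq plus cmovbq, as the new test checks.
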
From 0db4bf0442f37a26a577b974d69962332e19d378 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 08:30:36 -0400
Subject: [PATCH 02/11] Remove smin pattern, it might not be correct
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +---
llvm/test/CodeGen/X86/underflow-compare-fold.ll | 13 -------------
2 files changed, 1 insertion(+), 16 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c188ab49f2821..99d7000c3b62e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6211,9 +6211,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
// %cond = select %overflow, %a, %sub
if (N0.getOpcode() == ISD::SUB) {
SDValue A, B, C;
- if (sd_match(N,
- m_AnyOf(m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)),
- m_SMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C))))) {
+ if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)))) {
EVT AVT = A.getValueType();
if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 1d1cfa4f97c50..4dcaefc7a5586 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -14,16 +14,3 @@ entry:
%cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
ret i64 %cond
}
-
-define i64 @subIfNoUnderflow_smin(i64 %a, i64 %b) {
-; CHECK-LABEL: subIfNoUnderflow_smin
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: cmovbq %rdi, %rax
-; retq
-entry:
- %sub = sub i64 %a, %b
- %cond = tail call i64 @llvm.smin.i64(i64 %sub, i64 %a)
- ret i64 %cond
-}
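
For what it's worth, the signed variant really can diverge from the usubo/select form, which is presumably why it is dropped here; a small sketch (not part of the patch, values chosen purely for illustration) of one such input:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  int64_t a = 0, b = 1;
  int64_t sub = a - b;                  // -1
  int64_t smin_form = std::min(sub, a); // smin(-1, 0) == -1
  // usubo(0, 1) sets the borrow bit, so the select would pick a instead.
  bool borrow = static_cast<uint64_t>(a) < static_cast<uint64_t>(b);
  int64_t select_form = borrow ? a : sub; // 0
  assert(smin_form != select_form); // the smin fold would change the result
  return 0;
}
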
From c0ba3fd77bd8e7d8e3ce7fbe09cb50a6e211e0d7 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 2 Oct 2025 09:28:15 -0400
Subject: [PATCH 03/11] Fix check for retq
---
llvm/test/CodeGen/X86/underflow-compare-fold.ll | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 4dcaefc7a5586..2416bcb909485 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -8,7 +8,7 @@ define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: subq %rsi, %rax
; CHECK-NEXT: cmovbq %rdi, %rax
-; retq
+; CHECK-NEXT: retq
entry:
%sub = sub i64 %a, %b
%cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
From 57e2ea0310b83baab4efae67794d03ef53c364f2 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 3 Oct 2025 07:43:07 -0400
Subject: [PATCH 04/11] Address review comments
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++--------------
.../CodeGen/AArch64/underflow-compare-fold.ll | 14 ++++++++++++++
.../test/CodeGen/X86/underflow-compare-fold.ll | 5 ++---
3 files changed, 20 insertions(+), 17 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 99d7000c3b62e..5d559d713b3eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6200,21 +6200,11 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
return SD;
// (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
- //
- // IR:
- // %sub = sub %a, %b
- // %cond = umin %sub, %a
- // ->
- // %usubo = usubo %a, %b
- // %overflow = extractvalue %usubo, 1
- // %sub = extractvalue %usubo, 0
- // %cond = select %overflow, %a, %sub
if (N0.getOpcode() == ISD::SUB) {
- SDValue A, B, C;
- if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Value(C)))) {
- EVT AVT = A.getValueType();
- if (A == C && TLI.isOperationLegalOrCustom(ISD::USUBO, AVT)) {
- SDVTList VTs = DAG.getVTList(AVT, MVT::i1);
+ SDValue A, B;
+ if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
+ if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1);
SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
}
diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
new file mode 100644
index 0000000000000..6dcf2479d6daa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+define i64 @underflow_compare_fold(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold
+; CHECK: // %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index 2416bcb909485..a520ee1621d86 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -2,14 +2,13 @@
; GitHub issue #161036
-define i64 @subIfNoUnderflow_umin(i64 %a, i64 %b) {
-; CHECK-LABEL: subIfNoUnderflow_umin
+define i64 @underflow_compare_fold(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold
; CHECK-LABEL: %bb.0
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: subq %rsi, %rax
; CHECK-NEXT: cmovbq %rdi, %rax
; CHECK-NEXT: retq
-entry:
%sub = sub i64 %a, %b
%cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
ret i64 %cond
From 7e08f231fea2d17e1c36f2da99bb4b626518e72e Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Wed, 22 Oct 2025 08:39:37 -0400
Subject: [PATCH 05/11] Don't use MVT::i1
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5d559d713b3eb..8c6cde629b751 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6200,11 +6200,12 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
return SD;
// (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
- if (N0.getOpcode() == ISD::SUB) {
+ {
SDValue A, B;
if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
- SDVTList VTs = DAG.getVTList(VT, MVT::i1);
+ EVT SETCCT = getSetCCResultType(VT);
+ SDVTList VTs = DAG.getVTList(VT, SETCCT);
SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
}
From 7660fae35b12d17dc0080c2736475c8a3ba89007 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 07:50:22 -0400
Subject: [PATCH 06/11] Fix pattern match
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8c6cde629b751..66be2d7e16045 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6202,13 +6202,13 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
// (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
{
SDValue A, B;
- if (sd_match(N, m_UMin(m_Sub(m_Value(A), m_Value(B)), m_Deferred(A)))) {
- if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
- EVT SETCCT = getSetCCResultType(VT);
- SDVTList VTs = DAG.getVTList(VT, SETCCT);
- SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
- return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
- }
+ if (sd_match(N0, m_Sub(m_Value(A), m_Value(B))) &&
+ sd_match(N1, m_Specific(A)) &&
+ TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
+ EVT SETCCT = getSetCCResultType(VT);
+ SDVTList VTs = DAG.getVTList(VT, SETCCT);
+ SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
+ return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
}
}
From d795383e23462d2d5b6fa2c21973c2d2a26fb400 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 08:32:27 -0400
Subject: [PATCH 07/11] Add some more tests
---
.../CodeGen/AArch64/underflow-compare-fold.ll | 39 ++++++++++++++++++
.../CodeGen/X86/underflow-compare-fold.ll | 41 +++++++++++++++++++
2 files changed, 80 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
index 6dcf2479d6daa..5eb831e98c677 100644
--- a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
@@ -2,6 +2,7 @@
; GitHub issue #161036
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
define i64 @underflow_compare_fold(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold
; CHECK: // %bb.0:
@@ -12,3 +13,41 @@ define i64 @underflow_compare_fold(i64 %a, i64 %b) {
%cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
ret i64 %cond
}
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %sub = sub <16 x i8> %a, %b
+ %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+ ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(a,sub(a,b))
+define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_sub_inverse_args
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub x8, x0, x1
+; CHECK-NEXT: cmp x0, x8
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: add x8, x0, x1
+; CHECK-NEXT: cmp x8, x0
+; CHECK-NEXT: csel x0, x8, x0, lo
+; CHECK-NEXT: ret
+ %add = add i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+ ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
index a520ee1621d86..366c3f040e962 100644
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
@@ -2,6 +2,7 @@
; GitHub issue #161036
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
define i64 @underflow_compare_fold(i64 %a, i64 %b) {
; CHECK-LABEL: underflow_compare_fold
; CHECK-LABEL: %bb.0
@@ -13,3 +14,43 @@ define i64 @underflow_compare_fold(i64 %a, i64 %b) {
%cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
ret i64 %cond
}
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %sub = sub <16 x i8> %a, %b
+ %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+ ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(a,sub(a,b))
+define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_sub_inverse_args
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+ %add = add i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+ ret i64 %cond
+}
From a27dcf53a5ed5f77aa8c6dafcdfc91459bcb462f Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Fri, 31 Oct 2025 08:55:44 -0400
Subject: [PATCH 08/11] Simplify pattern match
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 66be2d7e16045..f144ce2888dc1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6201,14 +6201,13 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
// (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
{
- SDValue A, B;
- if (sd_match(N0, m_Sub(m_Value(A), m_Value(B))) &&
- sd_match(N1, m_Specific(A)) &&
+ SDValue B;
+ if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B))) &&
TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
EVT SETCCT = getSetCCResultType(VT);
SDVTList VTs = DAG.getVTList(VT, SETCCT);
- SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, A, B);
- return DAG.getSelect(DL, VT, USO.getValue(1), A, USO.getValue(0));
+ SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
+ return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
}
}
From e1ba30be7603af33f73fea4cb6c815f160a76e52 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Thu, 6 Nov 2025 08:41:19 -0500
Subject: [PATCH 09/11] Edits
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 ++-
.../umin-sub-to-usubo-select-combine.ll | 151 +++++++++++++++++
.../CodeGen/AArch64/underflow-compare-fold.ll | 53 ------
.../X86/umin-sub-to-usubo-select-combine.ll | 156 ++++++++++++++++++
.../CodeGen/X86/underflow-compare-fold.ll | 56 -------
5 files changed, 320 insertions(+), 116 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
delete mode 100644 llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
create mode 100644 llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
delete mode 100644 llvm/test/CodeGen/X86/underflow-compare-fold.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d43ec1002cd86..39c91fdfd371c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6216,16 +6216,22 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDLoc(N), VT, N0, N1))
return SD;
- // (umin (sub a, b) a) -> (usubo a, b); (select usubo.1, a, usubo.0)
- {
+ if (TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
SDValue B;
- if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B))) &&
- TLI.isOperationLegalOrCustom(ISD::USUBO, VT)) {
- EVT SETCCT = getSetCCResultType(VT);
- SDVTList VTs = DAG.getVTList(VT, SETCCT);
+
+ // (umin (sub a, b), a) -> (usubo a, b); (select usubo.1, a, usubo.0)
+ if (sd_match(N0, m_Sub(m_Specific(N1), m_Value(B)))) {
+ SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N1, B);
return DAG.getSelect(DL, VT, USO.getValue(1), N1, USO.getValue(0));
}
+
+ // (umin a, (sub a, b)) -> (usubo a, b); (select usubo.1, a, usubo.0)
+ if (sd_match(N1, m_Sub(m_Specific(N0), m_Value(B)))) {
+ SDVTList VTs = DAG.getVTList(VT, getSetCCResultType(VT));
+ SDValue USO = DAG.getNode(ISD::USUBO, DL, VTs, N0, B);
+ return DAG.getSelect(DL, VT, USO.getValue(1), N0, USO.getValue(0));
+ }
}
// Simplify the operands using demanded-bits information.
@@ -9386,7 +9392,7 @@ static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
// Check if the bytes offsets we are looking at match with either big or
// little endian value loaded. Return true for big endian, false for little
// endian, and std::nullopt if match failed.
-static std::optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
+static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
int64_t FirstOffset) {
// The endian can be decided only when it is 2 bytes at least.
unsigned Width = ByteOffsets.size();
diff --git a/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..fe3eee06db65e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,151 @@
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs x8, x0, x1
+; CHECK-NEXT: csel x0, x0, x8, lo
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: ret
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs w8, w0, w1
+; CHECK-NEXT: csel w0, w0, w8, lo
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs w8, w0, w1
+; CHECK-NEXT: csel w0, w0, w8, lo
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+ ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: subs w8, w0, w1
+; CHECK-NEXT: csel w0, w0, w8, lo
+; CHECK-NEXT: str w8, [x2]
+; CHECK-NEXT: ret
+ %sub = sub i32 %a, %b
+ store i32 %sub, ptr addrspace(1) %ptr
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16
+; CHECK-LABEL: %bb.0:
+; CHECK-LABEL: sub w8, w0, w1
+; CHECK-LABEL: and w9, w0, #0xffff
+; CHECK-LABEL: and w8, w8, #0xffff
+; CHECK-LABEL: cmp w8, w9
+; CHECK-LABEL: csel w0, w8, w9, lo
+; CHECK-LABEL: ret
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-LABEL: sub w8, w0, w1
+; CHECK-LABEL: and w9, w0, #0xffff
+; CHECK-LABEL: and w8, w8, #0xffff
+; CHECK-LABEL: cmp w9, w8
+; CHECK-LABEL: csel w0, w9, w8, lo
+; CHECK-LABEL: ret
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+ ret i16 %cond
+}
+
+; Negative test : i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-LABEL: sub w8, w0, w1
+; CHECK-LABEL: and w9, w0, #0xffff
+; CHECK-LABEL: and w10, w8, #0xffff
+; CHECK-LABEL: strh w8, [x2]
+; CHECK-LABEL: cmp w10, w9
+; CHECK-LABEL: csel w0, w10, w9, lo
+; CHECK-LABEL: ret
+ %sub = sub i16 %a, %b
+ store i16 %sub, ptr addrspace(1) %ptr
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
+; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: ret
+ %sub = sub <16 x i8> %a, %b
+ %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+ ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: add x8, x0, x1
+; CHECK-NEXT: cmp x8, x0
+; CHECK-NEXT: csel x0, x8, x0, lo
+; CHECK-NEXT: ret
+ %add = add i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+ ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll b/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
deleted file mode 100644
index 5eb831e98c677..0000000000000
--- a/llvm/test/CodeGen/AArch64/underflow-compare-fold.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold
-; CHECK: // %bb.0:
-; CHECK-NEXT: subs x8, x0, x1
-; CHECK-NEXT: csel x0, x0, x8, lo
-; CHECK-NEXT: ret
- %sub = sub i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
- ret i64 %cond
-}
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: sub v1.16b, v0.16b, v1.16b
-; CHECK-NEXT: umin v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: ret
- %sub = sub <16 x i8> %a, %b
- %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
- ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(a,sub(a,b))
-define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_sub_inverse_args
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: sub x8, x0, x1
-; CHECK-NEXT: cmp x0, x8
-; CHECK-NEXT: csel x0, x0, x8, lo
-; CHECK-NEXT: ret
- %sub = sub i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
- ret i64 %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmp x8, x0
-; CHECK-NEXT: csel x0, x8, x0, lo
-; CHECK-NEXT: ret
- %add = add i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
- ret i64 %cond
-}
diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000000000..e9756b411eb2c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,156 @@
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test : umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Positive test : multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: movq %rax, (%rdx)
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+ ret i32 %cond
+}
+
+; Positive test : i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ store i32 %sub, ptr addrspace(1) %ptr
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subw %si, %ax
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subw %si, %ax
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+ ret i16 %cond
+}
+
+; Positive test : i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use
+; CHECK-LABEL: %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subw %si, %ax
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ store i16 %sub, ptr addrspace(1) %ptr
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+
+; Negative test, vector types : umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %sub = sub <16 x i8> %a, %b
+ %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+ ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch : umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add
+; CHECK-LABEL: %bb.0
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+ %add = add i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+ ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/underflow-compare-fold.ll b/llvm/test/CodeGen/X86/underflow-compare-fold.ll
deleted file mode 100644
index 366c3f040e962..0000000000000
--- a/llvm/test/CodeGen/X86/underflow-compare-fold.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
-
-; GitHub issue #161036
-
-; Positive test : umin(sub(a,b),a) with scalar types should be folded
-define i64 @underflow_compare_fold(i64 %a, i64 %b) {
-; CHECK-LABEL: underflow_compare_fold
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: cmovbq %rdi, %rax
-; CHECK-NEXT: retq
- %sub = sub i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
- ret i64 %cond
-}
-
-; Negative test, vector types : umin(sub(a,b),a) but with vectors
-define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: underflow_compare_dontfold_vectors
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movdqa %xmm0, %xmm2
-; CHECK-NEXT: psubb %xmm1, %xmm2
-; CHECK-NEXT: pminub %xmm2, %xmm0
-; CHECK-NEXT: retq
- %sub = sub <16 x i8> %a, %b
- %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
- ret <16 x i8> %cond
-}
-
-; Negative test, pattern mismatch : umin(a,sub(a,b))
-define i64 @umin_sub_inverse_args(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_sub_inverse_args
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: subq %rsi, %rax
-; CHECK-NEXT: cmpq %rax, %rdi
-; CHECK-NEXT: cmovbq %rdi, %rax
-; CHECK-NEXT: retq
- %sub = sub i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
- ret i64 %cond
-}
-
-; Negative test, pattern mismatch : umin(add(a,b),a)
-define i64 @umin_add(i64 %a, i64 %b) {
-; CHECK-LABEL: umin_add
-; CHECK-LABEL: %bb.0
-; CHECK-NEXT: leaq (%rsi,%rdi), %rax
-; CHECK-NEXT: cmpq %rdi, %rax
-; CHECK-NEXT: cmovaeq %rdi, %rax
-; CHECK-NEXT: retq
- %add = add i64 %a, %b
- %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
- ret i64 %cond
-}
From 6df10cc293bd377d3c336350ba2e378241e570af Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Mon, 10 Nov 2025 08:20:14 -0500
Subject: [PATCH 10/11] Adjust patterns for AMDGPU
---
.../umin-sub-to-usubo-select-combine.ll | 45 ++++++++++---------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
index 22e4a24435f12..6e105dbbb3f94 100644
--- a/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/umin-sub-to-usubo-select-combine.ll
@@ -26,16 +26,16 @@ define i32 @v_underflow_compare_fold_i32(i32 %a, i32 %b) #0 {
; GFX9-LABEL: v_underflow_compare_fold_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_underflow_compare_fold_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sub = sub i32 %a, %b
%cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
@@ -46,16 +46,16 @@ define i32 @v_underflow_compare_fold_i32_commute(i32 %a, i32 %b) #0 {
; GFX9-LABEL: v_underflow_compare_fold_i32_commute:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_underflow_compare_fold_i32_commute:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
+; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sub = sub i32 %a, %b
%cond = call i32 @llvm.umin.i32(i32 %a, i32 %sub)
@@ -66,19 +66,20 @@ define i32 @v_underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace
; GFX9-LABEL: v_underflow_compare_fold_i32_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_sub_u32_e32 v1, v0, v1
-; GFX9-NEXT: v_min_u32_e32 v0, v1, v0
-; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: v_sub_u32_e32 v4, v0, v1
+; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v0, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v4, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_underflow_compare_fold_i32_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_u32_e32 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v[2:3], v1, off
+; GFX11-NEXT: v_sub_nc_u32_e32 v4, v0, v1
+; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, v0, v1
+; GFX11-NEXT: global_store_b32 v[2:3], v4, off
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sub = sub i32 %a, %b
store i32 %sub, ptr addrspace(1) %ptr
@@ -190,15 +191,19 @@ define amdgpu_ps i16 @s_underflow_compare_fold_i16(i16 inreg %a, i16 inreg %b) #
define amdgpu_ps i32 @s_underflow_compare_fold_i32(i32 inreg %a, i32 inreg %b) #0 {
; GFX9-LABEL: s_underflow_compare_fold_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_sub_i32 s1, s0, s1
-; GFX9-NEXT: s_min_u32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_underflow_compare_fold_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_sub_i32 s1, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s0, s1, s0
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%sub = sub i32 %a, %b
%cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
From 0a54b44162d622226ddc8f542b96ea45fd8e16f2 Mon Sep 17 00:00:00 2001
From: Chaitanya Koparkar <ckoparkar at gmail.com>
Date: Mon, 10 Nov 2025 12:39:21 -0500
Subject: [PATCH 11/11] Adjust patterns in AMDGPU/llvm.set.rounding.ll
---
llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 640 ++++++++++++------
1 file changed, 421 insertions(+), 219 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 0c8dbe865a872..a9eee2d8702c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -10,51 +10,85 @@ declare void @llvm.set.rounding(i32)
declare i32 @llvm.get.rounding()
define amdgpu_gfx void @s_set_rounding(i32 inreg %rounding) {
-; GFX678-LABEL: s_set_rounding:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_add_i32 s34, s4, -4
-; GFX678-NEXT: s_min_u32 s34, s4, s34
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: s_set_rounding:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
+; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
+; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s4, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
+; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_set_rounding:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s34, s4, -4
-; GFX9-NEXT: s_min_u32 s34, s4, s34
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s4, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: s_set_rounding:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s34, s4, -4
-; GFX10-NEXT: s_min_u32 s36, s4, s34
+; GFX10-NEXT: v_sub_co_u32 v0, s34, s4, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s34
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_set_rounding:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s0, s4, -4
-; GFX11-NEXT: s_min_u32 s2, s4, s0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s4, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s0
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
call void @llvm.set.rounding(i32 %rounding)
@@ -70,10 +104,12 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
; GFX6-NEXT: ;;#ASMSTART
; GFX6-NEXT: ;;#ASMEND
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s3, s2, -4
-; GFX6-NEXT: s_min_u32 s2, s2, s3
-; GFX6-NEXT: s_lshl_b32 s2, s2, 2
-; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s2, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX6-NEXT: s_endpgm
;
@@ -85,10 +121,12 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
; GFX7-NEXT: ;;#ASMSTART
; GFX7-NEXT: ;;#ASMEND
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s3, s2, -4
-; GFX7-NEXT: s_min_u32 s2, s2, s3
-; GFX7-NEXT: s_lshl_b32 s2, s2, 2
-; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s2, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[0:1], v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX7-NEXT: s_endpgm
;
@@ -100,10 +138,12 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
; GFX8-NEXT: ;;#ASMSTART
; GFX8-NEXT: ;;#ASMEND
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s3, s2, -4
-; GFX8-NEXT: s_min_u32 s2, s2, s3
-; GFX8-NEXT: s_lshl_b32 s2, s2, 2
-; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s2, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX8-NEXT: s_endpgm
;
@@ -115,40 +155,44 @@ define amdgpu_kernel void @s_set_rounding_kernel(i32 inreg %rounding) {
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s3, s2, -4
-; GFX9-NEXT: s_min_u32 s2, s2, s3
-; GFX9-NEXT: s_lshl_b32 s2, s2, 2
-; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s2, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_set_rounding_kernel:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dword s2, s[4:5], 0x24
-; GFX10-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX10-NEXT: s_mov_b32 s1, 0xb73e62d9
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s3, s2, -4
-; GFX10-NEXT: s_min_u32 s2, s2, s3
-; GFX10-NEXT: s_lshl_b32 s2, s2, 2
-; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX10-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX10-NEXT: s_mov_b32 s0, 0x1c84a50f
+; GFX10-NEXT: s_mov_b32 s1, 0xb73e62d9
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_set_rounding_kernel:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
-; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s3, s2, -4
-; GFX11-NEXT: s_min_u32 s2, s2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
+; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_endpgm
call void @llvm.set.rounding(i32 %rounding)
@@ -160,8 +204,8 @@ define void @v_set_rounding(i32 %rounding) {
; GFX6-LABEL: v_set_rounding:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, -4, v0
-; GFX6-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 4, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9
@@ -173,8 +217,8 @@ define void @v_set_rounding(i32 %rounding) {
; GFX7-LABEL: v_set_rounding:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v1, vcc, -4, v0
-; GFX7-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX7-NEXT: v_subrev_i32_e32 v1, vcc, 4, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9
@@ -186,8 +230,8 @@ define void @v_set_rounding(i32 %rounding) {
; GFX8-LABEL: v_set_rounding:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, -4, v0
-; GFX8-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 4, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9
@@ -199,8 +243,8 @@ define void @v_set_rounding(i32 %rounding) {
; GFX9-LABEL: v_set_rounding:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v1, -4, v0
-; GFX9-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, 4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9
@@ -212,10 +256,10 @@ define void @v_set_rounding(i32 %rounding) {
; GFX10-LABEL: v_set_rounding:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v1, -4, v0
+; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, v0, 4
; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX10-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
@@ -225,10 +269,10 @@ define void @v_set_rounding(i32 %rounding) {
; GFX11-LABEL: v_set_rounding:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v1, -4, v0
+; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, v0, 4
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -239,26 +283,74 @@ define void @v_set_rounding(i32 %rounding) {
}
define void @set_rounding_get_rounding() {
-; GFX678-LABEL: set_rounding_get_rounding:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
-; GFX678-NEXT: s_lshl_b32 s6, s4, 2
-; GFX678-NEXT: s_mov_b32 s4, 0xeb24da71
-; GFX678-NEXT: s_mov_b32 s5, 0xc96f385
-; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX678-NEXT: s_and_b32 s4, s4, 15
-; GFX678-NEXT: s_add_i32 s5, s4, 4
-; GFX678-NEXT: s_cmp_lt_u32 s4, 4
-; GFX678-NEXT: s_cselect_b32 s4, s4, s5
-; GFX678-NEXT: s_add_i32 s5, s4, -4
-; GFX678-NEXT: s_min_u32 s4, s4, s5
-; GFX678-NEXT: s_lshl_b32 s6, s4, 2
-; GFX678-NEXT: s_mov_b32 s4, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
-; GFX678-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: set_rounding_get_rounding:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX6-NEXT: s_lshl_b32 s6, s4, 2
+; GFX6-NEXT: s_mov_b32 s4, 0xeb24da71
+; GFX6-NEXT: s_mov_b32 s5, 0xc96f385
+; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX6-NEXT: s_and_b32 s4, s4, 15
+; GFX6-NEXT: s_add_i32 s5, s4, 4
+; GFX6-NEXT: s_cmp_lt_u32 s4, 4
+; GFX6-NEXT: s_cselect_b32 s4, s4, s5
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_mov_b32 s4, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_mov_b32 s5, 0xb73e62d9
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v0
+; GFX6-NEXT: v_readfirstlane_b32 s4, v0
+; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: set_rounding_get_rounding:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX7-NEXT: s_lshl_b32 s6, s4, 2
+; GFX7-NEXT: s_mov_b32 s4, 0xeb24da71
+; GFX7-NEXT: s_mov_b32 s5, 0xc96f385
+; GFX7-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX7-NEXT: s_and_b32 s4, s4, 15
+; GFX7-NEXT: s_add_i32 s5, s4, 4
+; GFX7-NEXT: s_cmp_lt_u32 s4, 4
+; GFX7-NEXT: s_cselect_b32 s4, s4, s5
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: s_mov_b32 s4, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s5, 0xb73e62d9
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[4:5], v0
+; GFX7-NEXT: v_readfirstlane_b32 s4, v0
+; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: set_rounding_get_rounding:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 0, 4)
+; GFX8-NEXT: s_lshl_b32 s6, s4, 2
+; GFX8-NEXT: s_mov_b32 s4, 0xeb24da71
+; GFX8-NEXT: s_mov_b32 s5, 0xc96f385
+; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX8-NEXT: s_and_b32 s4, s4, 15
+; GFX8-NEXT: s_add_i32 s5, s4, 4
+; GFX8-NEXT: s_cmp_lt_u32 s4, 4
+; GFX8-NEXT: s_cselect_b32 s4, s4, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s4, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_mov_b32 s4, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_mov_b32 s5, 0xb73e62d9
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: set_rounding_get_rounding:
; GFX9: ; %bb.0:
@@ -272,12 +364,14 @@ define void @set_rounding_get_rounding() {
; GFX9-NEXT: s_add_i32 s5, s4, 4
; GFX9-NEXT: s_cmp_lt_u32 s4, 4
; GFX9-NEXT: s_cselect_b32 s4, s4, s5
-; GFX9-NEXT: s_add_i32 s5, s4, -4
-; GFX9-NEXT: s_min_u32 s4, s4, s5
-; GFX9-NEXT: s_lshl_b32 s6, s4, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s4, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -293,12 +387,13 @@ define void @set_rounding_get_rounding() {
; GFX10-NEXT: s_add_i32 s5, s4, 4
; GFX10-NEXT: s_cmp_lt_u32 s4, 4
; GFX10-NEXT: s_cselect_b32 s4, s4, s5
-; GFX10-NEXT: s_add_i32 s5, s4, -4
-; GFX10-NEXT: s_min_u32 s6, s4, s5
+; GFX10-NEXT: v_sub_co_u32 v0, s5, s4, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s5
; GFX10-NEXT: s_mov_b32 s4, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s5, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s6, s6, 2
-; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[4:5]
+; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -314,12 +409,13 @@ define void @set_rounding_get_rounding() {
; GFX11-NEXT: s_add_i32 s1, s0, 4
; GFX11-NEXT: s_cmp_lt_u32 s0, 4
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%rounding = call i32 @llvm.get.rounding()
@@ -922,24 +1018,28 @@ define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
; GFX6-LABEL: s_set_rounding_i2_signext:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: s_set_rounding_i2_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -947,12 +1047,14 @@ define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s34, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s34, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -960,12 +1062,14 @@ define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s34, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s34, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -973,12 +1077,13 @@ define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s34, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s36, s34, s35
+; GFX10-NEXT: v_sub_co_u32 v0, s35, s34, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -986,12 +1091,13 @@ define amdgpu_gfx void @s_set_rounding_i2_signext(i2 signext inreg %rounding) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext.rounding = sext i2 %rounding to i32
@@ -1003,24 +1109,28 @@ define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
; GFX6-LABEL: s_set_rounding_i3_signext:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: s_set_rounding_i3_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -1028,12 +1138,14 @@ define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s34, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s34, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1041,12 +1153,14 @@ define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s34, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s34, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1054,12 +1168,13 @@ define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s34, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s36, s34, s35
+; GFX10-NEXT: v_sub_co_u32 v0, s35, s34, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1067,12 +1182,13 @@ define amdgpu_gfx void @s_set_rounding_i3_signext(i3 signext inreg %rounding) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext.rounding = sext i3 %rounding to i32
@@ -1084,24 +1200,28 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
; GFX6-LABEL: s_set_rounding_i3_zeroext:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_add_i32 s34, s4, -4
-; GFX6-NEXT: s_min_u32 s34, s4, s34
-; GFX6-NEXT: s_lshl_b32 s36, s34, 2
+; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX6-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: s_set_rounding_i3_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_add_i32 s34, s4, -4
-; GFX7-NEXT: s_min_u32 s34, s4, s34
-; GFX7-NEXT: s_lshl_b32 s36, s34, 2
+; GFX7-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s4, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX7-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -1109,12 +1229,14 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX8-NEXT: s_add_i32 s35, s34, -4
-; GFX8-NEXT: s_min_u32 s34, s34, s35
-; GFX8-NEXT: s_lshl_b32 s36, s34, 2
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s34, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX8-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -1122,12 +1244,14 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s34, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1135,12 +1259,13 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s34, 0xffff, s4
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s36, s34, s35
+; GFX10-NEXT: v_sub_co_u32 v0, s35, s34, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1148,12 +1273,13 @@ define amdgpu_gfx void @s_set_rounding_i3_zeroext(i3 zeroext inreg %rounding) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sext.rounding = zext i3 %rounding to i32
@@ -1488,31 +1614,67 @@ define amdgpu_gfx void @s_set_rounding_select_3_0(i32 inreg %cond) {
}
define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_4_0:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 4, 0
-; GFX678-NEXT: s_add_i32 s35, s34, -4
-; GFX678-NEXT: s_min_u32 s34, s34, s35
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: s_set_rounding_select_4_0:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s34, 4, 0
+; GFX6-NEXT: v_mov_b32_e32 v0, s34
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s34, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
+; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_select_4_0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_cmp_eq_u32 s4, 0
+; GFX7-NEXT: s_cselect_b32 s34, 4, 0
+; GFX7-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s34, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
+; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_select_4_0:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cselect_b32 s34, 4, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s34, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
+; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_set_rounding_select_4_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s34, 4, 0
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s34, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1521,12 +1683,13 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s34, 4, 0
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s36, s34, s35
+; GFX10-NEXT: v_sub_co_u32 v0, s35, s34, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1535,12 +1698,13 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b32 s0, 4, 0
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1550,31 +1714,67 @@ define amdgpu_gfx void @s_set_rounding_select_4_0(i32 inreg %cond) {
}
define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
-; GFX678-LABEL: s_set_rounding_select_3_5:
-; GFX678: ; %bb.0:
-; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX678-NEXT: s_cmp_eq_u32 s4, 0
-; GFX678-NEXT: s_cselect_b32 s34, 3, 5
-; GFX678-NEXT: s_add_i32 s35, s34, -4
-; GFX678-NEXT: s_min_u32 s34, s34, s35
-; GFX678-NEXT: s_lshl_b32 s36, s34, 2
-; GFX678-NEXT: s_mov_b32 s34, 0x1c84a50f
-; GFX678-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX678-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
-; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
-; GFX678-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: s_set_rounding_select_3_5:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_cmp_eq_u32 s4, 0
+; GFX6-NEXT: s_cselect_b32 s34, 3, 5
+; GFX6-NEXT: v_mov_b32_e32 v0, s34
+; GFX6-NEXT: v_sub_i32_e64 v1, vcc, s34, 4
+; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX6-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX6-NEXT: v_readfirstlane_b32 s34, v0
+; GFX6-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: s_set_rounding_select_3_5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_cmp_eq_u32 s4, 0
+; GFX7-NEXT: s_cselect_b32 s34, 3, 5
+; GFX7-NEXT: v_mov_b32_e32 v0, s34
+; GFX7-NEXT: v_sub_i32_e64 v1, vcc, s34, 4
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX7-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX7-NEXT: v_lshr_b64 v[0:1], s[34:35], v0
+; GFX7-NEXT: v_readfirstlane_b32 s34, v0
+; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: s_set_rounding_select_3_5:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; GFX8-NEXT: s_cselect_b32 s34, 3, 5
+; GFX8-NEXT: v_mov_b32_e32 v0, s34
+; GFX8-NEXT: v_sub_u32_e64 v1, vcc, s34, 4
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_mov_b32 s35, 0xb73e62d9
+; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX8-NEXT: v_readfirstlane_b32 s34, v0
+; GFX8-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_set_rounding_select_3_5:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_cselect_b32 s34, 3, 5
-; GFX9-NEXT: s_add_i32 s35, s34, -4
-; GFX9-NEXT: s_min_u32 s34, s34, s35
-; GFX9-NEXT: s_lshl_b32 s36, s34, 2
+; GFX9-NEXT: v_mov_b32_e32 v0, s34
+; GFX9-NEXT: v_sub_co_u32_e64 v1, vcc, s34, 4
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: s_mov_b32 s34, 0x1c84a50f
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX9-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX9-NEXT: v_readfirstlane_b32 s34, v0
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1583,12 +1783,13 @@ define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_cselect_b32 s34, 3, 5
-; GFX10-NEXT: s_add_i32 s35, s34, -4
-; GFX10-NEXT: s_min_u32 s36, s34, s35
+; GFX10-NEXT: v_sub_co_u32 v0, s35, s34, 4
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s34, s35
; GFX10-NEXT: s_mov_b32 s34, 0x1c84a50f
; GFX10-NEXT: s_mov_b32 s35, 0xb73e62d9
-; GFX10-NEXT: s_lshl_b32 s36, s36, 2
-; GFX10-NEXT: s_lshr_b64 s[34:35], s[34:35], s36
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[34:35]
+; GFX10-NEXT: v_readfirstlane_b32 s34, v0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1597,12 +1798,13 @@ define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b32 s0, 3, 5
-; GFX11-NEXT: s_add_i32 s1, s0, -4
-; GFX11-NEXT: s_min_u32 s2, s0, s1
+; GFX11-NEXT: v_sub_co_u32 v0, s1, s0, 4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
; GFX11-NEXT: s_mov_b32 s0, 0x1c84a50f
; GFX11-NEXT: s_mov_b32 s1, 0xb73e62d9
-; GFX11-NEXT: s_lshl_b32 s2, s2, 2
-; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0