[llvm] [AMDGPU][ISel] `setcc` peephole for comparisons with upper 32 bits of a 64-bit register pair (PR #177662)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 10 10:57:21 PST 2026
https://github.com/zGoldthorpe updated https://github.com/llvm/llvm-project/pull/177662
>From f1f552c3bb5204efd21910e864b0997cb20bfb49 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Fri, 23 Jan 2026 12:54:32 -0600
Subject: [PATCH 1/5] Added peephole for `setcc`.
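
A minimal IR sketch of the pattern this peephole targets (the function name is
only illustrative, not part of the patch): an unsigned comparison of a 64-bit
value against 2^32 depends only on the upper 32 bits, so it can be lowered as a
single 32-bit compare of the high half against zero instead of a full 64-bit
compare.

define i1 @hi32_is_zero(i64 %x) {
  ; %x < 2^32 holds exactly when the upper 32 bits of %x are zero, so this
  ; can select to a 32-bit compare of the high half with 0 (e.g. s_cmp_eq_u32)
  ; rather than a 64-bit compare.
  %cmp = icmp ult i64 %x, 4294967296
  ret i1 %cmp
}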
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +++++++
llvm/test/CodeGen/AMDGPU/setcc64.ll | 11 +++++++++++
2 files changed, 18 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b043d5354042d..14072575fab39 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17050,6 +17050,13 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
return LHS.getOperand(0);
}
}
+
+ // setcc v.64, 0x1'0000'0000, ult => setcc v.hi32, 0, eq
+ if (VT == MVT::i64 && CRHSVal.getZExtValue() == 1ull << 32 &&
+ CC == ISD::SETULT) {
+ return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
+ DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+ }
}
// Eliminate setcc by using carryout from add/sub instruction
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index b36ed3e91c573..4660b1c1e93e2 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -284,4 +284,15 @@ entry:
ret void
}
+; GCN-LABEL: {{^}}i64_select:
+; GCN: s_cmp_eq_u32
+; GCN: s_cselect_b32
+; GCN: s_cselect_b32
+define amdgpu_kernel void @i64_select(ptr addrspace(1) %out, i64 %a, i64 %b, i64 %mask) #0 {
+ %mask.hi.z = icmp ult i64 %mask, 4294967296
+ %tmp = select i1 %mask.hi.z, i64 %a, i64 %b
+ store i64 %tmp, ptr addrspace(1) %out
+ ret void
+}
+
attributes #0 = { nounwind }
>From cd03b5748dd5f9bdac9496c2b3d70f4700edeb36 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Fri, 23 Jan 2026 15:40:42 -0600
Subject: [PATCH 2/5] Extended the peephole to complementary cases and added more tests.
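
Roughly, the complementary forms now handled are sketched below in IR (function
names are illustrative): both reduce to a 32-bit test of the high half against
zero, using eq for the "high half is zero" forms and ne for the "high half is
nonzero" forms.

define i1 @hi32_is_zero_ule(i64 %x) {
  ; %x <= 0xFFFFFFFF is the same condition as %x < 2^32: the upper 32 bits are zero.
  %cmp = icmp ule i64 %x, 4294967295
  ret i1 %cmp
}

define i1 @hi32_is_nonzero(i64 %x) {
  ; %x >= 2^32 (equivalently %x > 0xFFFFFFFF) means the upper 32 bits are nonzero.
  %cmp = icmp uge i64 %x, 4294967296
  ret i1 %cmp
}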
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 20 ++-
llvm/test/CodeGen/AMDGPU/setcc-select.ll | 166 ++++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/setcc64.ll | 11 --
3 files changed, 182 insertions(+), 15 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/setcc-select.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 14072575fab39..a4c169c3d0fb4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17052,10 +17052,22 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
// setcc v.64, 0x1'0000'0000, ult => setcc v.hi32, 0, eq
- if (VT == MVT::i64 && CRHSVal.getZExtValue() == 1ull << 32 &&
- CC == ISD::SETULT) {
- return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
- DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
+ if (VT == MVT::i64) {
+ const uint64_t Bit32 = 1ull << 32;
+ const uint64_t CRHSInt = CRHSVal.getZExtValue();
+
+ ISD::CondCode NewCC = ISD::SETCC_INVALID;
+ if ((CRHSInt == Bit32 && CC == ISD::SETULT) ||
+ (CRHSInt == Bit32 - 1 && CC == ISD::SETULE)) {
+ NewCC = ISD::SETEQ;
+ } else if ((CRHSInt == Bit32 && CC == ISD::SETUGE) ||
+ (CRHSInt == Bit32 - 1 && CC == ISD::SETUGT)) {
+ NewCC = ISD::SETNE;
+ }
+
+ if (NewCC != ISD::SETCC_INVALID)
+ return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
+ DAG.getConstant(0, SL, MVT::i32), NewCC);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-select.ll b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
new file mode 100644
index 0000000000000..fd4229fc57117
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck %s
+
+define i32 @select.hi32.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.ult:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.z = icmp ult i64 %mask, 4294967296
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.ule:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.z = icmp ule i64 %mask, 4294967295
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.sgpr.ugt(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.ugt:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_cmp_lg_u32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp ugt i64 %mask, 4294967295
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.sgpr.uge(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.uge:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_cmp_lg_u32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp uge i64 %mask, 4294967296
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.vgpr.ult(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.ult:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.z = icmp ult i64 %mask, 4294967296
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.ule:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.z = icmp ule i64 %mask, 4294967295
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.vgpr.ugt(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.ugt:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp ugt i64 %mask, 4294967295
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.vgpr.uge(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.uge:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp uge i64 %mask, 4294967296
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.hi32.sgpr.multiuse(i64 inreg %mask, i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) {
+; CHECK-LABEL: select.hi32.sgpr.multiuse:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: s_cselect_b32 s5, s20, s21
+; CHECK-NEXT: s_add_i32 s4, s4, s5
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp ult i64 %mask, 4294967296
+ %ab = select i1 %mask.hi.nz, i32 %a, i32 %b
+ %cd = select i1 %mask.hi.nz, i32 %c, i32 %d
+ %ret = add i32 %ab, %cd
+ ret i32 %ret
+}
+
+
+define i32 @select.hi32.vgpr.multiuse(i64 %mask, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: select.hi32.vgpr.multiuse:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %mask.hi.nz = icmp ult i64 %mask, 4294967296
+ %ab = select i1 %mask.hi.nz, i32 %a, i32 %b
+ %cd = select i1 %mask.hi.nz, i32 %c, i32 %d
+ %ret = add i32 %ab, %cd
+ ret i32 %ret
+}
+
+define i32 @select.bad.sgpr(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.bad.sgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %test = icmp ule i64 %mask, 4294967296
+ %ret = select i1 %test, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.bad.vgpr(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.bad.vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 1
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %test = icmp ule i64 %mask, 4294967296
+ %ret = select i1 %test, i32 %a, i32 %b
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll
index 4660b1c1e93e2..b36ed3e91c573 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc64.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll
@@ -284,15 +284,4 @@ entry:
ret void
}
-; GCN-LABEL: {{^}}i64_select:
-; GCN: s_cmp_eq_u32
-; GCN: s_cselect_b32
-; GCN: s_cselect_b32
-define amdgpu_kernel void @i64_select(ptr addrspace(1) %out, i64 %a, i64 %b, i64 %mask) #0 {
- %mask.hi.z = icmp ult i64 %mask, 4294967296
- %tmp = select i1 %mask.hi.z, i64 %a, i64 %b
- store i64 %tmp, ptr addrspace(1) %out
- ret void
-}
-
attributes #0 = { nounwind }
>From 7d92cda7361117f85c26d84b9345e786518e4605 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Wed, 28 Jan 2026 11:46:56 -0600
Subject: [PATCH 3/5] Properly generalised the peephole optimisation.
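
A rough IR sketch of the generalised pattern (the 0xAAAAAAAA high half and the
function names mirror the updated tests and are only illustrative): when the
constant's low 32 bits are all zero, the lt/ge forms depend only on the high
halves, and when they are all ones, the le/gt forms do; in both cases the
compare can use the 32-bit high half of the constant directly.

define i1 @hi32_ult(i64 %x) {
  ; The constant's low 32 bits are all zero, so this is equivalent to
  ; comparing the high half of %x against 0xAAAAAAAA with ult.
  %cmp = icmp ult i64 %x, u0xaaaaaaaa00000000
  ret i1 %cmp
}

define i1 @hi32_ule(i64 %x) {
  ; The constant's low 32 bits are all ones, so this is equivalent to
  ; comparing the high half of %x against 0xAAAAAAAA with ule.
  %cmp = icmp ule i64 %x, u0xaaaaaaaaffffffff
  ret i1 %cmp
}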
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 25 +++--
llvm/test/CodeGen/AMDGPU/setcc-select.ll | 124 ++++++++++++++--------
2 files changed, 96 insertions(+), 53 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a4c169c3d0fb4..fdd4063ab5fd2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17051,23 +17051,30 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
}
}
- // setcc v.64, 0x1'0000'0000, ult => setcc v.hi32, 0, eq
+ // setcc v.64, 0xXXXX'XXXX'0000'0000, lt/ge
+ // => setcc v.hi32, 0xXXXX'XXXX, lt/ge
+ //
+ // setcc v.64, 0xXXXX'XXXX'FFFF'FFFF, le/gt
+ // => setcc v.hi32, 0xXXXX'XXXX, le/gt
if (VT == MVT::i64) {
- const uint64_t Bit32 = 1ull << 32;
+ const uint64_t Mask32 = maskTrailingOnes<uint64_t>(32);
const uint64_t CRHSInt = CRHSVal.getZExtValue();
ISD::CondCode NewCC = ISD::SETCC_INVALID;
- if ((CRHSInt == Bit32 && CC == ISD::SETULT) ||
- (CRHSInt == Bit32 - 1 && CC == ISD::SETULE)) {
- NewCC = ISD::SETEQ;
- } else if ((CRHSInt == Bit32 && CC == ISD::SETUGE) ||
- (CRHSInt == Bit32 - 1 && CC == ISD::SETUGT)) {
- NewCC = ISD::SETNE;
+
+ if ((CRHSInt & Mask32) == 0 && (CC == ISD::SETULT || CC == ISD::SETUGE ||
+ CC == ISD::SETLT || CC == ISD::SETGE)) {
+ NewCC = CC;
+ } else if ((CRHSInt & Mask32) == Mask32 &&
+ (CC == ISD::SETULE || CC == ISD::SETUGT || CC == ISD::SETLE ||
+ CC == ISD::SETGT)) {
+ NewCC = CC;
}
if (NewCC != ISD::SETCC_INVALID)
return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
- DAG.getConstant(0, SL, MVT::i32), NewCC);
+ DAG.getConstant(CRHSInt >> 32, SL, MVT::i32),
+ NewCC);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/setcc-select.ll b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
index fd4229fc57117..30c669c46ac1a 100644
--- a/llvm/test/CodeGen/AMDGPU/setcc-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/setcc-select.ll
@@ -5,50 +5,50 @@ define i32 @select.hi32.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
; CHECK-LABEL: select.hi32.sgpr.ult:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.z = icmp ult i64 %mask, 4294967296
+ %mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
-; CHECK-LABEL: select.hi32.sgpr.ule:
+define i32 @select.hi32.sgpr.uge(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.uge:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaa9
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.z = icmp ule i64 %mask, 4294967295
- %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ %mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.sgpr.ugt(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
-; CHECK-LABEL: select.hi32.sgpr.ugt:
+define i32 @select.hi32.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.ule:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_cmp_lg_u32 s17, 0
+; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaab
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp ugt i64 %mask, 4294967295
- %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ %mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.sgpr.uge(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
-; CHECK-LABEL: select.hi32.sgpr.uge:
+define i32 @select.hi32.sgpr.ugt(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.hi32.sgpr.ugt:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_cmp_lg_u32 s17, 0
+; CHECK-NEXT: s_cmp_gt_u32 s17, 0xaaaaaaaa
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp uge i64 %mask, 4294967296
+ %mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
ret i32 %ret
}
@@ -57,46 +57,50 @@ define i32 @select.hi32.vgpr.ult(i64 %mask, i32 %a, i32 %b) {
; CHECK-LABEL: select.hi32.vgpr.ult:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.z = icmp ult i64 %mask, 4294967296
+ %mask.hi.z = icmp ult i64 %mask, u0xaaaaaaaa00000000
%ret = select i1 %mask.hi.z, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
-; CHECK-LABEL: select.hi32.vgpr.ule:
+define i32 @select.hi32.vgpr.uge(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.uge:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaa9
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.z = icmp ule i64 %mask, 4294967295
- %ret = select i1 %mask.hi.z, i32 %a, i32 %b
+ %mask.hi.nz = icmp uge i64 %mask, u0xaaaaaaaa00000000
+ %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.vgpr.ugt(i64 %mask, i32 %a, i32 %b) {
-; CHECK-LABEL: select.hi32.vgpr.ugt:
+define i32 @select.hi32.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.ule:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaab
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp ugt i64 %mask, 4294967295
- %ret = select i1 %mask.hi.nz, i32 %a, i32 %b
+ %mask.hi.z = icmp ule i64 %mask, u0xaaaaaaaaffffffff
+ %ret = select i1 %mask.hi.z, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.hi32.vgpr.uge(i64 %mask, i32 %a, i32 %b) {
-; CHECK-LABEL: select.hi32.vgpr.uge:
+define i32 @select.hi32.vgpr.ugt(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.hi32.vgpr.ugt:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp uge i64 %mask, 4294967296
+ %mask.hi.nz = icmp ugt i64 %mask, u0xaaaaaaaaffffffff
%ret = select i1 %mask.hi.nz, i32 %a, i32 %b
ret i32 %ret
}
@@ -105,13 +109,13 @@ define i32 @select.hi32.sgpr.multiuse(i64 inreg %mask, i32 inreg %a, i32 inreg %
; CHECK-LABEL: select.hi32.sgpr.multiuse:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_cmp_eq_u32 s17, 0
+; CHECK-NEXT: s_cmp_lt_u32 s17, 0xaaaaaaaa
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: s_cselect_b32 s5, s20, s21
; CHECK-NEXT: s_add_i32 s4, s4, s5
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp ult i64 %mask, 4294967296
+ %mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000
%ab = select i1 %mask.hi.nz, i32 %a, i32 %b
%cd = select i1 %mask.hi.nz, i32 %c, i32 %d
%ret = add i32 %ab, %cd
@@ -123,44 +127,76 @@ define i32 @select.hi32.vgpr.multiuse(i64 %mask, i32 %a, i32 %b, i32 %c, i32 %d)
; CHECK-LABEL: select.hi32.vgpr.multiuse:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CHECK-NEXT: s_mov_b32 s4, 0xaaaaaaaa
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %mask.hi.nz = icmp ult i64 %mask, 4294967296
+ %mask.hi.nz = icmp ult i64 %mask, u0xaaaaaaaa00000000
%ab = select i1 %mask.hi.nz, i32 %a, i32 %b
%cd = select i1 %mask.hi.nz, i32 %c, i32 %d
%ret = add i32 %ab, %cd
ret i32 %ret
}
-define i32 @select.bad.sgpr(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
-; CHECK-LABEL: select.bad.sgpr:
+define i32 @select.bad.sgpr.ule(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.bad.sgpr.ule:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: v_mov_b32_e32 v1, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa
+; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
+; CHECK-NEXT: s_cselect_b32 s4, s18, s19
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %test = icmp ule i64 %mask, u0xaaaaaaaa00000000
+ %ret = select i1 %test, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.bad.sgpr.ult(i64 inreg %mask, i32 inreg %a, i32 inreg %b) {
+; CHECK-LABEL: select.bad.sgpr.ult:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, -1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0xaaaaaaaa
; CHECK-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1]
; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec
; CHECK-NEXT: s_cselect_b32 s4, s18, s19
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %test = icmp ule i64 %mask, 4294967296
+ %test = icmp ult i64 %mask, u0xaaaaaaaaffffffff
%ret = select i1 %test, i32 %a, i32 %b
ret i32 %ret
}
-define i32 @select.bad.vgpr(i64 %mask, i32 %a, i32 %b) {
-; CHECK-LABEL: select.bad.vgpr:
+
+define i32 @select.bad.vgpr.ule(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.bad.vgpr.ule:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s4, 1
-; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa
+; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %test = icmp ule i64 %mask, u0xaaaaaaaa00000000
+ %ret = select i1 %test, i32 %a, i32 %b
+ ret i32 %ret
+}
+
+define i32 @select.bad.vgpr.ult(i64 %mask, i32 %a, i32 %b) {
+; CHECK-LABEL: select.bad.vgpr.ult:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, -1
+; CHECK-NEXT: s_mov_b32 s5, 0xaaaaaaaa
; CHECK-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
- %test = icmp ule i64 %mask, 4294967296
+ %test = icmp ult i64 %mask, u0xaaaaaaaaffffffff
%ret = select i1 %test, i32 %a, i32 %b
ret i32 %ret
}
>From 42f7bfebd8aedd16d021cf96bbf402b549a56a2c Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Wed, 28 Jan 2026 12:12:14 -0600
Subject: [PATCH 4/5] Updated lit tests.
---
.../AMDGPU/agpr-copy-no-free-registers.ll | 24 +--
llvm/test/CodeGen/AMDGPU/commute-compares.ll | 15 +-
llvm/test/CodeGen/AMDGPU/div_i128.ll | 8 +-
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 104 ++++++------
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 19 +--
llvm/test/CodeGen/AMDGPU/extract-subvector.ll | 28 ++--
llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 152 +++++++++---------
llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll | 4 +-
llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 12 +-
llvm/test/CodeGen/AMDGPU/rem_i128.ll | 6 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 112 +++++++------
llvm/test/CodeGen/AMDGPU/saddsat.ll | 10 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 14 +-
.../CodeGen/AMDGPU/widen-vselect-and-mask.ll | 15 +-
14 files changed, 266 insertions(+), 257 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 1d55dcf5056f8..571b0fddf75b6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -568,17 +568,19 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX908-NEXT: s_cmp_lt_i32 s11, 0
; GFX908-NEXT: s_mov_b32 s13, s12
-; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
+; GFX908-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GFX908-NEXT: s_cmp_gt_i32 s11, -1
; GFX908-NEXT: v_mov_b32_e32 v4, s12
-; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
; GFX908-NEXT: v_mov_b32_e32 v6, s12
; GFX908-NEXT: v_mov_b32_e32 v8, s12
; GFX908-NEXT: v_mov_b32_e32 v5, s13
+; GFX908-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX908-NEXT: v_mov_b32_e32 v7, s13
; GFX908-NEXT: v_mov_b32_e32 v9, s13
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX908-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[2:3]
+; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v10
; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -599,9 +601,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX908-NEXT: s_add_u32 s20, s20, s4
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
; GFX908-NEXT: s_addc_u32 s21, s21, s5
; GFX908-NEXT: s_mov_b64 s[22:23], 0
+; GFX908-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v3
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
; GFX908-NEXT: .LBB3_5: ; %bb16
@@ -732,15 +734,17 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX90A-NEXT: s_cmp_lt_i32 s11, 0
; GFX90A-NEXT: s_mov_b32 s13, s12
-; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; GFX90A-NEXT: s_cselect_b64 s[18:19], -1, 0
+; GFX90A-NEXT: s_cmp_gt_i32 s11, -1
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
+; GFX90A-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX90A-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[2:3]
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
+; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v12
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_readfirstlane_b32 s9, v4
@@ -760,8 +764,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT: s_add_u32 s20, s20, s4
; GFX90A-NEXT: s_addc_u32 s21, s21, s5
-; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
+; GFX90A-NEXT: v_cmp_lt_i32_e64 s[24:25], -1, v5
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
; GFX90A-NEXT: .LBB3_5: ; %bb16
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
index ce4609495b0e0..e4fb014af46ad 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll
@@ -541,19 +541,20 @@ define amdgpu_kernel void @commute_sgt_neg1_i64(ptr addrspace(1) %out, ptr addrs
; GCN-LABEL: commute_sgt_neg1_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[3:4]
-; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT: v_not_b32_e32 v0, v3
+; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index d5b5ab6e457f9..db4647d09f061 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -13,8 +13,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc
@@ -22,9 +21,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0, v4
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
+; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
; GFX9-NEXT: v_cndmask_b32_e32 v21, v5, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v22, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc
@@ -53,6 +52,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_add_u32_e32 v3, 32, v3
; GFX9-NEXT: v_min_u32_e32 v3, v3, v6
; GFX9-NEXT: v_ffbh_u32_e32 v6, v8
+; GFX9-NEXT: v_ashrrev_i32_e32 v18, 31, v7
; GFX9-NEXT: v_add_u32_e32 v6, 32, v6
; GFX9-NEXT: v_ffbh_u32_e32 v7, v9
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 1e96b63bcd321..aaf3a368bcc53 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -8,14 +8,14 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v26, v24
; SDAG-NEXT: v_mov_b32_e32 v27, v25
; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; SDAG-NEXT: v_cndmask_b32_e64 v19, v1, v17, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v18, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
@@ -23,31 +23,31 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v1, v18
; SDAG-NEXT: v_ffbh_u32_e32 v2, v19
; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8
+; SDAG-NEXT: v_or_b32_e32 v0, v18, v16
; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v1
; SDAG-NEXT: v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v19, v17
; SDAG-NEXT: v_min_u32_e32 v2, v21, v2
; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v22
; SDAG-NEXT: v_ffbh_u32_e32 v22, v17
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7]
-; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT: v_min_u32_e32 v1, v21, v22
-; SDAG-NEXT: v_add_i32_e64 v3, s[8:9], 64, v2
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_min_u32_e32 v0, v21, v22
+; SDAG-NEXT: v_add_i32_e64 v1, s[8:9], 64, v2
; SDAG-NEXT: v_addc_u32_e64 v8, s[8:9], 0, 0, s[8:9]
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v0, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v3, s[4:5]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
; SDAG-NEXT: v_cndmask_b32_e64 v10, v8, 0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v8, v3, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v1, v0, vcc
; SDAG-NEXT: v_ffbh_u32_e32 v1, v29
; SDAG-NEXT: v_ffbh_u32_e32 v21, v28
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v9, s[4:5]
; SDAG-NEXT: v_or_b32_e32 v0, v29, v2
; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v1
; SDAG-NEXT: v_ffbh_u32_e32 v11, v2
@@ -57,12 +57,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
; SDAG-NEXT: v_min_u32_e32 v0, v11, v21
-; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 64, v9
-; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, s[6:7]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 64, v9
+; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[4:5]
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v0, v8
; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v10, vcc
; SDAG-NEXT: v_xor_b32_e32 v0, 0x7f, v8
@@ -208,7 +208,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v23, v19
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc
@@ -226,7 +226,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v30, v9
; SDAG-NEXT: v_min_u32_e32 v6, v16, v6
; SDAG-NEXT: v_subb_u32_e32 v16, vcc, 0, v14, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v17, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
@@ -1547,61 +1547,61 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0
; SDAG-NEXT: v_mov_b32_e32 v19, 0
-; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc
; SDAG-NEXT: v_mov_b32_e32 v29, v28
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; SDAG-NEXT: v_cndmask_b32_e64 v17, v1, v17, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v16, v0, v16, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc
; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5]
; SDAG-NEXT: v_ffbh_u32_e32 v18, v16
; SDAG-NEXT: v_ffbh_u32_e32 v20, v17
-; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
; SDAG-NEXT: v_or_b32_e32 v2, v16, v0
-; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8
; SDAG-NEXT: v_ffbh_u32_e32 v22, v0
-; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
; SDAG-NEXT: v_or_b32_e32 v3, v17, v1
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc
+; SDAG-NEXT: v_add_i32_e64 v22, s[4:5], 32, v22
; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
-; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22
-; SDAG-NEXT: v_ffbh_u32_e32 v22, v1
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5]
-; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
-; SDAG-NEXT: v_min_u32_e32 v3, v20, v22
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v1
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[6:7]
+; SDAG-NEXT: v_min_u32_e32 v3, v22, v20
; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18
-; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5]
+; SDAG-NEXT: v_addc_u32_e64 v9, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7]
; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v9, 0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc
; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
; SDAG-NEXT: v_ffbh_u32_e32 v21, v30
-; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v18, s[6:7]
; SDAG-NEXT: v_or_b32_e32 v8, v31, v2
; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9
-; SDAG-NEXT: v_ffbh_u32_e32 v20, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v2
; SDAG-NEXT: v_or_b32_e32 v9, v30, v3
; SDAG-NEXT: v_min_u32_e32 v11, v11, v21
-; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v18
; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT: v_min_u32_e32 v8, v20, v21
-; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11
-; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT: v_min_u32_e32 v8, v18, v21
+; SDAG-NEXT: v_add_i32_e64 v9, s[6:7], 64, v11
+; SDAG-NEXT: v_addc_u32_e64 v11, s[6:7], 0, 0, s[6:7]
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10
-; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc
; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc
; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
@@ -1743,7 +1743,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_mov_b32_e32 v35, v32
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v6, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
@@ -1761,7 +1761,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; SDAG-NEXT: v_ffbh_u32_e32 v24, v5
; SDAG-NEXT: v_min_u32_e32 v18, v18, v20
; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v14, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v15
; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v23, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v21, s[4:5]
; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 0299cc60bfc86..1bbe503bfe072 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -125,17 +125,14 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY4]], %subreg.sub0, killed [[COPY3]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3
- ; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, killed [[COPY5]], %subreg.sub1
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY6]], implicit-def dead $scc
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
- ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE2]], [[COPY8]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[COPY7]], implicit-def dead $scc
+ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_CMP_LT_I32 killed [[COPY5]], killed [[S_MOV_B32_2]], implicit-def $scc
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_64 = COPY $scc
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY8]], killed [[COPY7]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.2, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
@@ -155,12 +152,12 @@ define i1 @divergent_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x, i1 %z) {
; GCN-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[DEF]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY]], implicit $exec
; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
- ; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY killed [[S_MOV_B64_]]
- ; GCN-NEXT: [[V_CMP_LT_I64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I64_e64 killed [[REG_SEQUENCE]], [[COPY2]], implicit $exec
- ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I64_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
+ ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[COPY2]], killed [[S_MOV_B32_]], implicit $exec
+ ; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
; GCN-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
; GCN-NEXT: SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
index 87d7a73c5c01f..4279b4f285b6f 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll
@@ -137,9 +137,9 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: .LBB1_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; GCN-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[6:7]
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v7
; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, -1
; GCN-NEXT: v_mov_b32_e32 v3, -1
@@ -205,13 +205,13 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: .LBB2_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[8:9]
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v9
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v11
; GCN-NEXT: v_cndmask_b32_e64 v6, v1, -1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, -1
; GCN-NEXT: v_mov_b32_e32 v3, -1
@@ -295,14 +295,14 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 %
; GCN-NEXT: .LBB3_4: ; %exit
; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[8:9]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[8:9], 0, v[12:13]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[10:11], 0, v[14:15]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[12:13], 0, v[16:17]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[14:15], 0, v[18:19]
-; GCN-NEXT: v_cmp_gt_i64_e64 s[16:17], 0, v[4:5]
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
+; GCN-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v9
+; GCN-NEXT: v_cmp_gt_i32_e64 s[6:7], 0, v11
+; GCN-NEXT: v_cmp_gt_i32_e64 s[8:9], 0, v13
+; GCN-NEXT: v_cmp_gt_i32_e64 s[10:11], 0, v15
+; GCN-NEXT: v_cmp_gt_i32_e64 s[12:13], 0, v17
+; GCN-NEXT: v_cmp_gt_i32_e64 s[14:15], 0, v19
+; GCN-NEXT: v_cmp_gt_i32_e64 s[16:17], 0, v5
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, s[16:17]
; GCN-NEXT: v_cndmask_b32_e64 v2, v1, -1, vcc
; GCN-NEXT: v_cndmask_b32_e64 v4, v1, -1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 200fbf5d220b4..3e1da4b41989b 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -20,28 +20,28 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: s_cbranch_execz .LBB0_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-check.saturate
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
-; SDAG-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: s_mov_b32 s7, -1
-; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB0_7
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-check.exp.size
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0
-; SDAG-NEXT: s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB0_4
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-exp.large
@@ -50,35 +50,35 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_add_u32_e32 v3, 0xfffffbcd, v6
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3
; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3
; SDAG-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v3, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v12, v10, v1
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v13, v10, v[1:2]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v4, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v10, v[1:2]
; SDAG-NEXT: v_mul_lo_u32 v11, v8, v5
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v5, 0
; SDAG-NEXT: v_mov_b32_e32 v1, v3
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2]
; SDAG-NEXT: v_add3_u32 v6, v6, v12, v11
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v9, v7, v[5:6]
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, v[5:6]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_mul_lo_u32 v10, v9, v13
; SDAG-NEXT: v_mul_lo_u32 v7, v9, v7
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v13, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, v[2:3]
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: v_add3_u32 v4, v7, v6, v10
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v5
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; SDAG-NEXT: ; implicit-def: $vgpr10
@@ -89,30 +89,30 @@ define i128 @fptosi_f64_to_i128(double %x) {
; SDAG-NEXT: v_sub_u32_e32 v0, 0x433, v6
; SDAG-NEXT: v_lshrrev_b64 v[4:5], v0, v[4:5]
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v4, v10, 0
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[12:13], v5, v10, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, 0
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2]
; SDAG-NEXT: v_mov_b32_e32 v1, v6
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v4, v8, v[1:2]
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[12:13], 0, 0, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v5, v8, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v9, v4, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v9, v4, v[3:4]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v8, v[1:2]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v4, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[3:4]
; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
; SDAG-NEXT: .LBB0_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB0_7: ; %Flow2
-; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-saturate
; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: v_mov_b32_e32 v0, v1
; SDAG-NEXT: v_mov_b32_e32 v2, v1
; SDAG-NEXT: ; %bb.9: ; %Flow3
-; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -359,28 +359,28 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: s_cbranch_execz .LBB1_10
; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-check.saturate
; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
-; SDAG-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
-; SDAG-NEXT: s_movk_i32 s6, 0xff7f
-; SDAG-NEXT: s_mov_b32 s7, -1
-; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
+; SDAG-NEXT: v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB1_7
; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-check.exp.size
-; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0
-; SDAG-NEXT: s_mov_b64 s[6:7], 0x432
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
-; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5]
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
-; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
; SDAG-NEXT: s_cbranch_execz .LBB1_4
; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-exp.large
@@ -389,35 +389,35 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_add_u32_e32 v3, 0xfffffbcd, v6
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
; SDAG-NEXT: v_lshlrev_b64 v[6:7], v2, v[4:5]
-; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3
; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3
; SDAG-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5]
-; SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[6:7]
-; SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v3, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v0, s[6:7]
; SDAG-NEXT: v_mul_lo_u32 v12, v10, v1
; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v10, 0
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_cndmask_b32_e32 v13, 0, v4, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v13, v10, v[1:2]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v4, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v10, v[1:2]
; SDAG-NEXT: v_mul_lo_u32 v11, v8, v5
; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v10, v5, 0
; SDAG-NEXT: v_mov_b32_e32 v1, v3
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v8, v[1:2]
; SDAG-NEXT: v_add3_u32 v6, v6, v12, v11
-; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[6:7], v9, v7, v[5:6]
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, v[5:6]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
; SDAG-NEXT: v_mul_lo_u32 v10, v9, v13
; SDAG-NEXT: v_mul_lo_u32 v7, v9, v7
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v13, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v8, v[2:3]
; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: ; implicit-def: $vgpr9
; SDAG-NEXT: v_add3_u32 v4, v7, v6, v10
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
-; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v5
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v4, s[4:5]
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
; SDAG-NEXT: ; implicit-def: $vgpr10
@@ -428,30 +428,30 @@ define i128 @fptoui_f64_to_i128(double %x) {
; SDAG-NEXT: v_sub_u32_e32 v0, 0x433, v6
; SDAG-NEXT: v_lshrrev_b64 v[4:5], v0, v[4:5]
; SDAG-NEXT: v_mov_b32_e32 v2, 0
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v4, v10, 0
-; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[12:13], v5, v10, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v10, 0
+; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v10, v[1:2]
; SDAG-NEXT: v_mov_b32_e32 v1, v6
-; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[12:13], v4, v8, v[1:2]
-; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2
-; SDAG-NEXT: v_addc_co_u32_e64 v3, s[12:13], 0, 0, vcc
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v5, v8, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[12:13], v9, v4, v[2:3]
-; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[12:13], v9, v4, v[3:4]
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v8, v[1:2]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v4, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v4, v[3:4]
; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
; SDAG-NEXT: .LBB1_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB1_7: ; %Flow2
-; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11]
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-saturate
; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
-; SDAG-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[4:5]
-; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: v_mov_b32_e32 v0, v1
; SDAG-NEXT: v_mov_b32_e32 v2, v1
; SDAG-NEXT: ; %bb.9: ; %Flow3
-; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup
; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
; SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
index eee3352fa7452..e775ed84f4534 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.bf.ll
@@ -21,8 +21,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
; GCN-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
; GCN-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; GCN-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; GCN-NEXT: ; implicit-def: $vgpr8
+; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -40,6 +39,7 @@ define bfloat @sitofp_i128_to_bf16(i128 %x) {
; GCN-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
; GCN-NEXT: v_sub_u32_e32 v2, 0x80, v7
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
+; GCN-NEXT: ; implicit-def: $vgpr8
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GCN-NEXT: ; %bb.2: ; %itofp-if-else
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 2f9182e6e7c6a..eae0ff9e0f315 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -17,8 +17,7 @@ define float @sitofp_i128_to_f32(i128 %x) {
; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -36,6 +35,7 @@ define float @sitofp_i128_to_f32(i128 %x) {
; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
+; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
@@ -528,8 +528,7 @@ define double @sitofp_i128_to_f64(i128 %x) {
; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; SDAG-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc
@@ -547,6 +546,7 @@ define double @sitofp_i128_to_f64(i128 %x) {
; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v9
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v2
+; SDAG-NEXT: ; implicit-def: $vgpr10
; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -1106,8 +1106,7 @@ define half @sitofp_i128_to_f16(i128 %x) {
; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v1, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, 0, v2, vcc
; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v3, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
@@ -1125,6 +1124,7 @@ define half @sitofp_i128_to_f16(i128 %x) {
; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
; SDAG-NEXT: v_sub_u32_e32 v2, 0x80, v7
; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v2
+; SDAG-NEXT: ; implicit-def: $vgpr8
; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 4e1f0c0538bb5..941b1fa66c49e 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -14,7 +14,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
@@ -24,8 +24,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT: v_mov_b32_e32 v21, v20
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v7
; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc
@@ -70,6 +69,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v21, v20
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index 8861b7726a4c5..0e9ea5430d0ea 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -20,18 +20,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s2
-; SI-NEXT: s_add_u32 s10, s2, s8
-; SI-NEXT: s_addc_u32 s11, s3, s9
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1]
-; SI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[8:9], 0
; SI-NEXT: s_mov_b32 s4, s0
+; SI-NEXT: s_add_u32 s0, s2, s8
+; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: v_mov_b32_e32 v1, s11
-; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
+; SI-NEXT: s_addc_u32 s1, s3, s9
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT: s_cmp_lt_i32 s9, 0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -41,18 +42,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: s_add_u32 s6, s2, s4
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: s_addc_u32 s7, s3, s5
-; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[4:5], 0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2]
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_addc_u32 s1, s3, s5
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3]
+; VI-NEXT: s_cmp_lt_i32 s5, 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -67,11 +69,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX9-NEXT: s_add_u32 s4, s2, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_addc_u32 s5, s3, s7
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[6:7], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; GFX9-NEXT: s_cmp_lt_i32 s7, 0
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -86,9 +89,10 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s4, s2, s6
; GFX10-NEXT: s_addc_u32 s5, s3, s7
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
+; GFX10-NEXT: s_cmp_lt_i32 s7, 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
-; GFX10-NEXT: s_xor_b32 s2, s6, s2
+; GFX10-NEXT: s_cselect_b32 s3, -1, 0
+; GFX10-NEXT: s_xor_b32 s2, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2
@@ -104,13 +108,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s6, s2, s4
; GFX11-NEXT: s_addc_u32 s7, s3, s5
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
+; GFX11-NEXT: s_cmp_lt_i32 s5, 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3]
-; GFX11-NEXT: s_xor_b32 s2, s4, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_xor_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
@@ -355,7 +360,8 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_addc_u32 s13, s5, s7
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1]
-; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0
+; SI-NEXT: s_cmp_lt_i32 s7, 0
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s12
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
@@ -381,10 +387,11 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_addc_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: s_cmp_lt_i32 s7, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
@@ -402,11 +409,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: s_addc_u32 s1, s13, s15
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[14:15], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: s_cmp_lt_i32 s15, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v2, v0, s[10:11]
@@ -419,11 +427,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_add_u32 s0, s12, s14
; GFX10-NEXT: s_addc_u32 s1, s13, s15
-; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13]
+; GFX10-NEXT: s_cmp_lt_i32 s15, 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[12:13]
+; GFX10-NEXT: s_cselect_b32 s3, -1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: s_xor_b32 s0, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX10-NEXT: global_store_byte v2, v3, s[10:11]
@@ -435,11 +444,12 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s8, s4, s6
; GFX11-NEXT: s_addc_u32 s9, s5, s7
-; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
+; GFX11-NEXT: s_cmp_lt_i32 s7, 0
; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
+; GFX11-NEXT: s_cselect_b32 s5, -1, 0
; GFX11-NEXT: v_mov_b32_e32 v0, s8
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
-; GFX11-NEXT: s_xor_b32 s4, s6, s4
+; GFX11-NEXT: s_xor_b32 s4, s5, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX11-NEXT: s_clause 0x1
@@ -478,11 +488,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
-; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
-; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -503,11 +513,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
-; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
-; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
+; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
@@ -521,11 +531,11 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
+; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
; GFX9-NEXT: global_store_byte v6, v0, s[10:11]
; GFX9-NEXT: s_endpgm
;
@@ -540,9 +550,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX10-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
+; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX10-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9]
; GFX10-NEXT: global_store_byte v6, v0, s[10:11]
@@ -560,9 +570,9 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
+; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 9debb88dd0d7f..5d5f3be9ce9fd 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -430,7 +430,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; GFX6-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
@@ -443,7 +443,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; GFX8-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
@@ -456,7 +456,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; GFX9-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
@@ -468,7 +468,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
+; GFX10-NEXT: v_cmp_gt_i32_e64 s4, 0, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
@@ -481,7 +481,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
-; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
+; GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index c3a7e2ae4f344..23172eb2d8158 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1047,7 +1047,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3]
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v3
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, 2, v4, vcc
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
@@ -1069,10 +1069,10 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
+; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
@@ -1089,7 +1089,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
@@ -1108,7 +1108,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
@@ -1127,7 +1127,7 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc
; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
index 69f6c38d55a2d..ecb0c8eb9e0be 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
@@ -9,11 +9,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
; GCN-LABEL: widen_vselect_and_mask_v4f64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
-; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_mov_b64 s[4:5], 16
; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v0
@@ -22,13 +22,11 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_neq_f64_e64 s[0:1], s[0:1], 0
-; GCN-NEXT: v_mov_b32_e32 v2, v1
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[1:2]
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, 0
; GCN-NEXT: s_mov_b64 s[4:5], 0
-; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_endpgm
@@ -54,18 +52,17 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b64 s[8:9], 16
; GCN-NEXT: s_mov_b32 s11, 0xf000
-; GCN-NEXT: s_mov_b32 s10, -1
; GCN-NEXT: v_mov_b32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: v_mov_b32_e32 v4, v1
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[0:1], 0
-; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3]
+; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[2:3]
; GCN-NEXT: v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
-; GCN-NEXT: v_mov_b32_e32 v6, v5
-; GCN-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[5:6]
+; GCN-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GCN-NEXT: buffer_store_dwordx4 v[1:4], off, s[8:11], 0
>From 2f705140ac17f804c3846a613598f44734ca2983 Mon Sep 17 00:00:00 2001
From: Zach Goldthorpe <Zach.Goldthorpe at amd.com>
Date: Tue, 10 Feb 2026 12:55:48 -0600
Subject: [PATCH 5/5] Updated lit-test after merge.
---
llvm/test/CodeGen/AMDGPU/div_v2i128.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 4464524abb633..52410c6d3698e 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -833,7 +833,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) {
; SDAG-NEXT: v_mov_b32_e32 v19, v18
; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc
; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v2, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v3
; SDAG-NEXT: v_cndmask_b32_e64 v13, v1, v10, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v12, v0, v8, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v3, vcc
@@ -991,7 +991,7 @@ define <2 x i128> @v_sdiv_v2i128_v_pow2k(<2 x i128> %lhs) {
; SDAG-NEXT: v_mov_b32_e32 v21, v20
; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v6, vcc
-; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v7
; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v3, s[4:5]
; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v2, s[4:5]
; SDAG-NEXT: v_subb_u32_e32 v2, vcc, 0, v7, vcc