[llvm] [AMDGPU] Swap select operands to allow later v_cndmask shrinking into vop2 (PR #142140)

Ana Mihajlovic via llvm-commits llvm-commits at lists.llvm.org
Fri May 30 08:01:20 PDT 2025


https://github.com/mihajlovicana updated https://github.com/llvm/llvm-project/pull/142140

>From 25b2dd8526b52f407bbc51431289768564c11363 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 11:43:34 +0200
Subject: [PATCH 1/5] test precommit

---
 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll | 764 +++++++++++++++++++++
 1 file changed, 764 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll

diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
new file mode 100644
index 0000000000000..12ccdfff07c6f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -0,0 +1,764 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GCN
+
+;tests for integer 32
+define amdgpu_cs void @test_i32_sge(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sge i32 %a, 2
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 %q, i32 0
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i32_sle(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sle:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sle i32 %a, 2
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 %q, i32 0
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_sgt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sgt i32 2, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i32_slt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i32_slt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp slt i32 2, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+;tests for integer 64
+define amdgpu_cs void @test_i64_sge(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sge i64 %a, 2
+  %val1 = select i1 %vcc, i64 %p, i64 0
+  %val2 = select i1 %vcc, i64 %q, i64 0
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i64_sle(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sle:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sle i64 %a, 2
+  %val1 = select i1 %vcc, i64 %p, i64 0
+  %val2 = select i1 %vcc, i64 %q, i64 0
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_sgt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp sgt i64 2, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_i64_slt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp slt i64 2, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+;tests for unsigned 32
+define amdgpu_cs void @test_u32_eq(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_eq:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp eq i32 1, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_negative_case(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_negative_case:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp eq i32 %a, -1
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr addrspace(1) %out) {
+; GCN-LABEL: test_mixed:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[5:6], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp eq i32 -1, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 %q, i32 0
+  %val3 = select i1 %vcc, i32 0, i32 %r
+  %val4 = select i1 %vcc, i32 0, i32 %s
+  %ret0 = insertelement <4 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <4 x i32> %ret0, i32 %val2, i32 1
+  %ret2 = insertelement <4 x i32> %ret1, i32 %val3, i32 2
+  %ret3 = insertelement <4 x i32> %ret2, i32 %val4, i32 3
+  store <4 x i32> %ret3, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
+; GCN-LABEL: test_sgpr:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v5, s0, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v6, s1, 0, vcc_lo
+; GCN-NEXT:    global_store_b96 v[2:3], v[4:6], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp eq i32 %a, -1
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %val3 = select i1 %vcc, i32 0, i32 %r
+  %ret0 = insertelement <3 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <3 x i32> %ret0, i32 %val2, i32 1
+  %ret2 = insertelement <3 x i32> %ret1, i32 %val3, i32 2
+  store <3 x i32> %ret2, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u32_ne(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ne:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ne i32 1, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u32_uge(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_uge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp uge i32 %a, 2
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 %q, i32 0
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u32_ule(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ule:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ule i32 %a, 2
+  %val1 = select i1 %vcc, i32 %p, i32 0
+  %val2 = select i1 %vcc, i32 %q, i32 0
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ugt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ugt i32 2, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u32_ult:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 2, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ult i32 2, %a
+  %val1 = select i1 %vcc, i32 0, i32 %p
+  %val2 = select i1 %vcc, i32 0, i32 %q
+  %ret0 = insertelement <2 x i32> poison, i32 %val1, i32 0
+  %ret1 = insertelement <2 x i32> %ret0, i32 %val2, i32 1
+  store <2 x i32> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+;tests for unsigned 64
+define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_eq:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp eq i64 1, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u64_ne(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ne:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ne i64 1, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u64_uge(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_uge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp uge i64 %a, 2
+  %val1 = select i1 %vcc, i64 %p, i64 0
+  %val2 = select i1 %vcc, i64 %q, i64 0
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u64_ule(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ule:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ule i64 %a, 2
+  %val1 = select i1 %vcc, i64 %p, i64 0
+  %val2 = select i1 %vcc, i64 %q, i64 0
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ugt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ugt i64 2, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_u64_ult(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_u64_ult:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 2, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = icmp ult i64 2, %a
+  %val1 = select i1 %vcc, i64 0, i64 %p
+  %val2 = select i1 %vcc, i64 0, i64 %q
+  %ret0 = insertelement <2 x i64> poison, i64 %val1, i64 0
+  %ret1 = insertelement <2 x i64> %ret0, i64 %val2, i64 1
+  store <2 x i64> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+;tests for float 32
+define amdgpu_cs void @test_f32_oeq(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_oeq:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp oeq float %a, 2.0
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_negative_modifiers(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_negative_modifiers:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %r = fneg float %p
+  %s = fneg float %q
+  %vcc = fcmp oeq float 2.0, %a
+  %val1 = select i1 %vcc, float 0.0, float %r
+  %val2 = select i1 %vcc, float 0.0, float %s
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_one(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_one:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lg_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp one float %a, 2.0
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_ord(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ord:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ord float %a, 2.0
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_uno(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_uno:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp uno float %a, 2.0
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_oge(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_oge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp oge float 2.0, %a
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_ole(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ole:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_le_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ole float 2.0, %a
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_ogt(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_ogt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ogt float 2.0, %a
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f32_olt(float %a, float %p, float %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f32_olt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp olt float 2.0, %a
+  %val1 = select i1 %vcc, float 0.0, float %p
+  %val2 = select i1 %vcc, float 0.0, float %q
+  %ret0 = insertelement <2 x float> poison, float %val1, i32 0
+  %ret1 = insertelement <2 x float> %ret0, float %val2, i32 1
+  store <2 x float> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+;tests for float 64
+define amdgpu_cs void @test_f64_oeq(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_oeq:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp oeq double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_one(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_one:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lg_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp one double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_oge(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_oge:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_ge_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp oge double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_ole(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ole:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ole double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_ogt(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ogt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ogt double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_olt(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_olt:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp olt double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_ord(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_ord:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp ord double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @test_f64_uno(double %a, double %p, double %q, ptr addrspace(1) %out) {
+; GCN-LABEL: test_f64_uno:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
+; GCN-NEXT:    s_endpgm
+.entry:
+  %vcc = fcmp uno double 2.0, %a
+  %val1 = select i1 %vcc, double 0.0, double %p
+  %val2 = select i1 %vcc, double 0.0, double %q
+  %ret0 = insertelement <2 x double> poison, double %val1, i32 0
+  %ret1 = insertelement <2 x double> %ret0, double %val2, i32 1
+  store <2 x double> %ret1, ptr addrspace(1) %out
+  ret void
+}

>From 91f5ad7bd6b815e163815946172e5b51dd7b65c3 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 14:47:02 +0200
Subject: [PATCH 2/5] [AMDGPU] Swap select operands to allow later v_cndmask
 shrinking into vop2

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  34 +
 .../GlobalISel/divergence-structurizer.ll     |  10 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  20 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   |  16 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |  20 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          | 182 ++---
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        | 754 +++++++++---------
 .../issue130120-eliminate-frame-index.ll      |  34 +-
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          | 124 ++-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  | 180 ++---
 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll    |  95 +--
 12 files changed, 718 insertions(+), 763 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 52177a2523bcb..7f17132be12aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1775,6 +1775,40 @@ bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
       UA.isUniform(&I))
     Changed |= promoteUniformOpToI32(I);
 
+  // Check whether the select operands should be swapped
+  // so that the resulting v_cndmask can later be shrunk
+  // into a VOP2 encoding.
+  int ShouldSwap = 0;
+  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
+    auto User = Use->getUser();
+
+    if (!isa<SelectInst>(User))
+      return Changed;
+
+    auto SelectI = dyn_cast<SelectInst>(User);
+
+    if (isa<Constant>(SelectI->getOperand(1)) &&
+        !isa<Constant>(SelectI->getOperand(2)))
+      ShouldSwap++;
+    else if (!isa<Constant>(SelectI->getOperand(1)) &&
+             isa<Constant>(SelectI->getOperand(2)))
+      ShouldSwap--;
+  }
+
+  if (ShouldSwap <= 0)
+    return Changed;
+
+  I.setPredicate(I.getInverseCmpPredicate());
+
+  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
+    auto SelectI = dyn_cast<Instruction>(Use->getUser());
+
+    auto Op = SelectI->getOperand(1);
+
+    SelectI->setOperand(1, SelectI->getOperand(2));
+    SelectI->setOperand(2, Op);
+  }
+
   return Changed;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index e31077dd1986f..71f4bfaab77c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -547,14 +547,16 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
 ; GFX10-NEXT:    s_xor_b32 s5, exec_lo, s5
 ; GFX10-NEXT:  ; %bb.3: ; %.loopexit
 ; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
-; GFX10-NEXT:    v_cmp_gt_i32_e64 s0, v5, v0
+; GFX10-NEXT:    v_cmp_le_i32_e64 s0, v5, v0
 ; GFX10-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX10-NEXT:    s_mov_b32 s7, exec_lo
 ; GFX10-NEXT:    s_xor_b32 s6, vcc_lo, s6
+; GFX10-NEXT:    s_mov_b32 s8, exec_lo
+; GFX10-NEXT:    s_xor_b32 s7, s0, s7
 ; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
-; GFX10-NEXT:    s_or_b32 s6, s0, s6
+; GFX10-NEXT:    s_or_b32 s6, s7, s6
 ; GFX10-NEXT:    s_and_b32 s0, exec_lo, s0
-; GFX10-NEXT:    s_xor_b32 s6, s6, s7
+; GFX10-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX10-NEXT:    s_andn2_b32 s4, s4, exec_lo
 ; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
 ; GFX10-NEXT:    s_or_b32 s3, s3, s0
@@ -588,7 +590,7 @@ define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2,
 ; GFX10-NEXT:    s_branch .LBB6_1
 ; GFX10-NEXT:  .LBB6_8: ; %.exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v3, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s2
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 .entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 52c90817dddd1..e3cc8ee340f0c 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1100,9 +1100,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1328,8 +1328,8 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
@@ -1565,10 +1565,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe8, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0xffff, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1676,10 +1676,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, -16
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
@@ -1790,10 +1790,10 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe7, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0x7f, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 9503ffbdb4104..5e24af3e71c1d 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1543,8 +1543,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -1713,8 +1713,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1898,8 +1898,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
@@ -2067,8 +2067,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1
 ; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 7f83fc571bf29..d997904d81d54 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -951,9 +951,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
@@ -1153,8 +1153,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1357,9 +1357,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, s2
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0xffff, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1455,9 +1455,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
@@ -1558,8 +1558,8 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
-; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
+; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0x7f, v1, vcc_lo
 ; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
 ; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 97bcd8b5ee68a..c1abc3002a990 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1168,8 +1168,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1502,8 +1502,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 06c0417211809..7c525c0a66070 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -24,17 +24,14 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-NEXT:    v_ffbh_u32_e32 v13, v9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v20, v5, v1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v21, v4, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v2, vcc
 ; GFX9-NEXT:    v_or_b32_e32 v3, v20, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v21, v0
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT:    v_or_b32_e32 v3, v9, v11
-; GFX9-NEXT:    v_or_b32_e32 v2, v8, v10
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; GFX9-NEXT:    v_ffbh_u32_e32 v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, 32, v2
 ; GFX9-NEXT:    v_ffbh_u32_e32 v3, v1
@@ -43,52 +40,49 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_u32_e32 v3, 32, v3
 ; GFX9-NEXT:    v_ffbh_u32_e32 v4, v20
 ; GFX9-NEXT:    v_min_u32_e32 v3, v3, v4
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, 64, v3
 ; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_ffbh_u32_e32 v6, v11
+; GFX9-NEXT:    v_ffbh_u32_e32 v5, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX9-NEXT:    v_ffbh_u32_e32 v3, v10
 ; GFX9-NEXT:    v_add_u32_e32 v3, 32, v3
-; GFX9-NEXT:    v_min_u32_e32 v3, v3, v6
-; GFX9-NEXT:    v_ffbh_u32_e32 v6, v8
-; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
-; GFX9-NEXT:    v_ffbh_u32_e32 v7, v9
-; GFX9-NEXT:    v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_min_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_ffbh_u32_e32 v5, v8
+; GFX9-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX9-NEXT:    v_min_u32_e32 v5, v5, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 64, v6
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, 64, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, 0, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v7, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v18, v16
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v19, v17
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX9-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v13, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v12, vcc
+; GFX9-NEXT:    v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-NEXT:    v_or_b32_e32 v7, v9, v11
+; GFX9-NEXT:    v_or_b32_e32 v6, v8, v10
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v12, vcc
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[8:9], 0, v[6:7]
 ; GFX9-NEXT:    v_xor_b32_e32 v6, 0x7f, v2
 ; GFX9-NEXT:    v_or_b32_e32 v7, v3, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v6, v4
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v18, v16
+; GFX9-NEXT:    v_mov_b32_e32 v19, v17
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v11, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v10, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, 0, s[4:5]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_6
 ; GFX9-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1241,10 +1235,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_subb_co_u32_e32 v5, vcc, v3, v17, vcc
 ; GFX9-G-NEXT:    v_or_b32_e32 v0, v18, v4
 ; GFX9-G-NEXT:    v_or_b32_e32 v1, v19, v5
-; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; GFX9-G-NEXT:    v_or_b32_e32 v0, v8, v10
 ; GFX9-G-NEXT:    v_or_b32_e32 v1, v9, v11
-; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
 ; GFX9-G-NEXT:    v_ffbh_u32_e32 v1, v18
 ; GFX9-G-NEXT:    v_ffbh_u32_e32 v0, v19
 ; GFX9-G-NEXT:    v_add_u32_e32 v1, 32, v1
@@ -1273,10 +1267,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
 ; GFX9-G-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
-; GFX9-G-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT:    v_cmp_le_u64_e64 s[6:7], v[0:1], v[6:7]
+; GFX9-G-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[6:7]
-; GFX9-G-NEXT:    v_cmp_lt_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT:    v_cmp_ge_u64_e64 s[6:7], 0, v[2:3]
 ; GFX9-G-NEXT:    v_or_b32_e32 v15, v1, v3
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[6:7]
 ; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
@@ -1292,7 +1286,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v7, v9, 0, vcc
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v12, v10, 0, vcc
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v13, v11, 0, vcc
-; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT:    v_xor_b32_e32 v20, 1, v20
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GFX9-G-NEXT:    v_or_b32_e32 v14, v20, v14
 ; GFX9-G-NEXT:    v_and_b32_e32 v14, 1, v14
@@ -2307,64 +2302,58 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_udiv_i128_vv:
 ; GFX9:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v9, v5, v7
-; GFX9-NEXT:    v_or_b32_e32 v8, v4, v6
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT:    v_or_b32_e32 v9, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v8, v0, v2
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
-; GFX9-NEXT:    v_ffbh_u32_e32 v8, v6
-; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
-; GFX9-NEXT:    v_ffbh_u32_e32 v9, v7
-; GFX9-NEXT:    v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT:    v_ffbh_u32_e32 v9, v4
-; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT:    v_ffbh_u32_e32 v10, v5
-; GFX9-NEXT:    v_min_u32_e32 v9, v9, v10
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 64, v9
-; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    v_ffbh_u32_e32 v11, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT:    v_ffbh_u32_e32 v9, v2
-; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT:    v_min_u32_e32 v9, v9, v11
-; GFX9-NEXT:    v_ffbh_u32_e32 v11, v0
+; GFX9-NEXT:    v_ffbh_u32_e32 v10, v6
+; GFX9-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX9-NEXT:    v_ffbh_u32_e32 v11, v7
+; GFX9-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_ffbh_u32_e32 v11, v4
 ; GFX9-NEXT:    v_add_u32_e32 v11, 32, v11
-; GFX9-NEXT:    v_ffbh_u32_e32 v12, v1
+; GFX9-NEXT:    v_ffbh_u32_e32 v12, v5
 ; GFX9-NEXT:    v_min_u32_e32 v11, v11, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, 64, v11
 ; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX9-NEXT:    v_ffbh_u32_e32 v14, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX9-NEXT:    v_ffbh_u32_e32 v11, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v12, 0, vcc
-; GFX9-NEXT:    v_sub_co_u32_e32 v12, vcc, v8, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v13, vcc, v10, v13, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v8, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13]
-; GFX9-NEXT:    v_or_b32_e32 v10, v13, v15
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
-; GFX9-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX9-NEXT:    v_xor_b32_e32 v9, 0x7f, v12
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v14
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[9:10]
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:    v_add_u32_e32 v11, 32, v11
+; GFX9-NEXT:    v_ffbh_u32_e32 v12, v3
+; GFX9-NEXT:    v_min_u32_e32 v11, v11, v12
+; GFX9-NEXT:    v_ffbh_u32_e32 v12, v0
+; GFX9-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX9-NEXT:    v_min_u32_e32 v12, v12, v14
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX9-NEXT:    v_or_b32_e32 v9, v5, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
+; GFX9-NEXT:    v_sub_co_u32_e32 v12, vcc, v10, v11
+; GFX9-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_or_b32_e32 v8, v4, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-NEXT:    v_or_b32_e32 v9, v1, v3
+; GFX9-NEXT:    v_or_b32_e32 v8, v0, v2
+; GFX9-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v10, vcc
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[12:13]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[8:9], 0, v[8:9]
+; GFX9-NEXT:    v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-NEXT:    v_or_b32_e32 v9, v13, v15
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v14
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v3, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v1, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v0, 0, s[4:5]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_6
 ; GFX9-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -3368,10 +3357,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-G-NEXT:    v_or_b32_e32 v8, v4, v6
 ; GFX9-G-NEXT:    v_or_b32_e32 v9, v5, v7
-; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; GFX9-G-NEXT:    v_or_b32_e32 v8, v0, v2
 ; GFX9-G-NEXT:    v_or_b32_e32 v9, v1, v3
-; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
 ; GFX9-G-NEXT:    v_ffbh_u32_e32 v9, v4
 ; GFX9-G-NEXT:    v_ffbh_u32_e32 v8, v5
 ; GFX9-G-NEXT:    v_add_u32_e32 v9, 32, v9
@@ -3400,10 +3389,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7]
 ; GFX9-G-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-G-NEXT:    v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7]
-; GFX9-G-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9]
-; GFX9-G-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT:    v_cmp_le_u64_e64 s[6:7], v[12:13], v[8:9]
+; GFX9-G-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[6:7]
-; GFX9-G-NEXT:    v_cmp_lt_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT:    v_cmp_ge_u64_e64 s[6:7], 0, v[14:15]
 ; GFX9-G-NEXT:    v_or_b32_e32 v17, v13, v15
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[6:7]
 ; GFX9-G-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
@@ -3419,7 +3408,8 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v8, v2, 0, vcc
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v9, v3, 0, vcc
-; GFX9-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT:    v_xor_b32_e32 v18, 1, v18
 ; GFX9-G-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GFX9-G-NEXT:    v_or_b32_e32 v16, v18, v16
 ; GFX9-G-NEXT:    v_and_b32_e32 v16, 1, v16
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..7e373f08f7c85 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -7,100 +7,94 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
-; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v24, 31, v3
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v25, 31, v11
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x80
+; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v25
 ; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v2, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, v1, v17, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v21, v1, v18, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, v0, v16, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v2, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v18, v2, v19, s[4:5]
 ; SDAG-NEXT:    v_ffbh_u32_e32 v1, v20
 ; SDAG-NEXT:    v_ffbh_u32_e32 v2, v21
-; SDAG-NEXT:    v_cndmask_b32_e64 v17, v3, v0, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v0, v20, v16
+; SDAG-NEXT:    v_cndmask_b32_e64 v19, v3, v0, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v0, v20, v18
 ; SDAG-NEXT:    v_sub_i32_e32 v3, vcc, 0, v8
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v1
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v16
-; SDAG-NEXT:    v_or_b32_e32 v1, v21, v17
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], 32, v1
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v18
+; SDAG-NEXT:    v_or_b32_e32 v1, v21, v19
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, 0, v9, vcc
-; SDAG-NEXT:    v_min_u32_e32 v2, v19, v2
-; SDAG-NEXT:    v_add_i32_e64 v19, s[4:5], 32, v22
-; SDAG-NEXT:    v_ffbh_u32_e32 v22, v17
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_min_u32_e32 v2, v16, v2
+; SDAG-NEXT:    v_add_i32_e64 v16, s[4:5], 32, v22
+; SDAG-NEXT:    v_ffbh_u32_e32 v22, v19
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[6:7], 0, v[10:11]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v28, v9, v23, s[6:7]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v29, v8, v3, s[6:7]
-; SDAG-NEXT:    v_min_u32_e32 v1, v19, v22
+; SDAG-NEXT:    v_min_u32_e32 v1, v16, v22
 ; SDAG-NEXT:    v_add_i32_e64 v2, s[8:9], 64, v2
 ; SDAG-NEXT:    v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v11, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v0, v10, v0, s[6:7]
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v3, 0, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v10, v2, v1, vcc
 ; SDAG-NEXT:    v_ffbh_u32_e32 v3, v29
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v28
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v28
 ; SDAG-NEXT:    v_cndmask_b32_e64 v1, v11, v8, s[6:7]
 ; SDAG-NEXT:    v_or_b32_e32 v2, v29, v0
 ; SDAG-NEXT:    v_add_i32_e32 v8, vcc, 32, v3
 ; SDAG-NEXT:    v_ffbh_u32_e32 v11, v0
 ; SDAG-NEXT:    v_or_b32_e32 v3, v28, v1
-; SDAG-NEXT:    v_min_u32_e32 v8, v8, v19
+; SDAG-NEXT:    v_min_u32_e32 v8, v8, v16
 ; SDAG-NEXT:    v_add_i32_e32 v11, vcc, 32, v11
-; SDAG-NEXT:    v_ffbh_u32_e32 v19, v1
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT:    v_min_u32_e32 v2, v11, v19
+; SDAG-NEXT:    v_ffbh_u32_e32 v16, v1
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_min_u32_e32 v2, v11, v16
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 64, v8
 ; SDAG-NEXT:    v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7]
 ; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v8, 0, s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[6:7]
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[8:9], vcc, s[4:5]
 ; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v8, v9, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v2
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v18, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v17, vcc
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[10:11], v[2:3]
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v17, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v10
 ; SDAG-NEXT:    v_or_b32_e32 v9, v3, v11
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v18, v19, s[4:5]
-; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
-; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, v17, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT:    v_cndmask_b32_e64 v22, v16, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, v21, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, v19, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v22, v18, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, v21, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v23, v20, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_6
 ; SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v2
-; SDAG-NEXT:    v_sub_i32_e64 v18, s[4:5], 63, v2
+; SDAG-NEXT:    v_sub_i32_e64 v16, s[4:5], 63, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v3, vcc
-; SDAG-NEXT:    v_lshl_b64 v[18:19], v[20:21], v18
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[20:21], v16
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v10, v30, v32
 ; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v2
 ; SDAG-NEXT:    v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT:    v_lshl_b64 v[2:3], v[16:17], v34
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[18:19], v34
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
 ; SDAG-NEXT:    v_lshl_b64 v[22:23], v[20:21], v34
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
@@ -108,13 +102,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v3, v3, v11
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v10
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v19, v3, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v18, v2, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, v23, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, v22, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v16, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v17, 0, v23, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v16, 0, v22, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v17, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v16, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v19, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v18, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v11, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
@@ -123,18 +117,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
 ; SDAG-NEXT:    v_lshr_b64 v[8:9], v[20:21], v30
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, 64, v30
-; SDAG-NEXT:    v_lshl_b64 v[10:11], v[16:17], v10
+; SDAG-NEXT:    v_lshl_b64 v[10:11], v[18:19], v10
 ; SDAG-NEXT:    v_or_b32_e32 v11, v9, v11
 ; SDAG-NEXT:    v_or_b32_e32 v10, v8, v10
 ; SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v30
 ; SDAG-NEXT:    v_subrev_i32_e64 v8, s[4:5], 64, v30
-; SDAG-NEXT:    v_lshr_b64 v[8:9], v[16:17], v8
+; SDAG-NEXT:    v_lshr_b64 v[8:9], v[18:19], v8
 ; SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v11, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v30
 ; SDAG-NEXT:    v_cndmask_b32_e64 v21, v9, v21, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v20, v8, v20, s[4:5]
-; SDAG-NEXT:    v_lshr_b64 v[8:9], v[16:17], v30
+; SDAG-NEXT:    v_lshr_b64 v[8:9], v[18:19], v30
 ; SDAG-NEXT:    v_cndmask_b32_e32 v23, 0, v9, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e32 v22, 0, v8, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v29
@@ -143,27 +137,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v10, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v11, 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; SDAG-NEXT:    v_mov_b32_e32 v18, 0
+; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    s_mov_b64 s[4:5], 0
 ; SDAG-NEXT:    v_mov_b32_e32 v9, 0
 ; SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v19
-; SDAG-NEXT:    v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[16:17], v[16:17], 1
 ; SDAG-NEXT:    v_lshl_b64 v[22:23], v[22:23], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v21
 ; SDAG-NEXT:    v_lshl_b64 v[20:21], v[20:21], 1
 ; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v3
 ; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT:    v_or_b32_e32 v19, v17, v19
-; SDAG-NEXT:    v_or_b32_e32 v18, v16, v18
-; SDAG-NEXT:    v_or_b32_e32 v16, v22, v38
-; SDAG-NEXT:    v_or_b32_e32 v17, v20, v39
+; SDAG-NEXT:    v_or_b32_e32 v17, v19, v17
+; SDAG-NEXT:    v_or_b32_e32 v16, v18, v16
+; SDAG-NEXT:    v_or_b32_e32 v18, v22, v38
+; SDAG-NEXT:    v_or_b32_e32 v19, v20, v39
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v8
-; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v34, v17
+; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v34, v19
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v35, v21, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v16, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v18, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v23, vcc
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
 ; SDAG-NEXT:    v_and_b32_e32 v20, v8, v29
@@ -171,22 +165,22 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_and_b32_e32 v38, v8, v0
 ; SDAG-NEXT:    v_and_b32_e32 v39, v8, v1
 ; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, v17, v20
+; SDAG-NEXT:    v_sub_i32_e32 v20, vcc, v19, v20
 ; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, v21, v22, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v22, vcc, v16, v38, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v22, vcc, v18, v38, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, v23, v39, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
 ; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, -1, v33, vcc
-; SDAG-NEXT:    v_or_b32_e32 v16, v30, v32
-; SDAG-NEXT:    v_or_b32_e32 v17, v31, v33
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT:    v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    v_or_b32_e32 v3, v11, v3
 ; SDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; SDAG-NEXT:    v_or_b32_e32 v2, v10, v2
-; SDAG-NEXT:    v_mov_b32_e32 v17, v9
-; SDAG-NEXT:    v_mov_b32_e32 v16, v8
+; SDAG-NEXT:    v_mov_b32_e32 v19, v9
+; SDAG-NEXT:    v_mov_b32_e32 v18, v8
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB0_3
 ; SDAG-NEXT:  ; %bb.4: ; %Flow13
@@ -194,184 +188,178 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:  .LBB0_5: ; %Flow14
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; SDAG-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v16, 31, v19
-; SDAG-NEXT:    v_lshl_b64 v[2:3], v[18:19], 1
-; SDAG-NEXT:    v_or_b32_e32 v0, v0, v16
-; SDAG-NEXT:    v_or_b32_e32 v18, v11, v1
-; SDAG-NEXT:    v_or_b32_e32 v19, v9, v3
+; SDAG-NEXT:    v_lshrrev_b32_e32 v18, 31, v17
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[16:17], 1
+; SDAG-NEXT:    v_or_b32_e32 v0, v0, v18
+; SDAG-NEXT:    v_or_b32_e32 v16, v11, v1
+; SDAG-NEXT:    v_or_b32_e32 v17, v9, v3
 ; SDAG-NEXT:    v_or_b32_e32 v22, v10, v0
 ; SDAG-NEXT:    v_or_b32_e32 v23, v8, v2
 ; SDAG-NEXT:  .LBB0_6: ; %Flow16
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
-; SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v7
-; SDAG-NEXT:    v_ashrrev_i32_e32 v17, 31, v15
+; SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT:    v_ashrrev_i32_e32 v19, 31, v15
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, 0, v4
-; SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
-; SDAG-NEXT:    v_mov_b32_e32 v20, v16
-; SDAG-NEXT:    v_mov_b32_e32 v21, v17
+; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x80
+; SDAG-NEXT:    v_mov_b32_e32 v20, v18
+; SDAG-NEXT:    v_mov_b32_e32 v21, v19
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, 0, v5, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, 0, v6, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v3, v5, v1, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v2, v4, v0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v5, v5, v1, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v4, v4, v0, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v0, vcc, 0, v7, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v1, v2
-; SDAG-NEXT:    v_ffbh_u32_e32 v4, v3
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, v7, v0, s[4:5]
-; SDAG-NEXT:    v_sub_i32_e32 v5, vcc, 0, v12
-; SDAG-NEXT:    v_or_b32_e32 v0, v2, v6
-; SDAG-NEXT:    v_ffbh_u32_e32 v9, v6
-; SDAG-NEXT:    v_add_i32_e64 v10, s[4:5], 32, v1
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v13, vcc
-; SDAG-NEXT:    v_or_b32_e32 v1, v3, v7
-; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 32, v9
-; SDAG-NEXT:    v_ffbh_u32_e32 v30, v7
-; SDAG-NEXT:    v_min_u32_e32 v4, v10, v4
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v14, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v6, v2, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v1, v4
+; SDAG-NEXT:    v_ffbh_u32_e32 v2, v5
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, v7, v0, s[4:5]
+; SDAG-NEXT:    v_sub_i32_e32 v3, vcc, 0, v12
+; SDAG-NEXT:    v_or_b32_e32 v0, v4, v10
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v10
+; SDAG-NEXT:    v_add_i32_e64 v7, s[4:5], 32, v1
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v13, vcc
+; SDAG-NEXT:    v_or_b32_e32 v1, v5, v11
+; SDAG-NEXT:    v_add_i32_e64 v6, s[4:5], 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v30, v11
+; SDAG-NEXT:    v_min_u32_e32 v2, v7, v2
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, 0, v14, vcc
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
-; SDAG-NEXT:    v_cndmask_b32_e64 v28, v13, v11, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v29, v12, v5, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[0:1]
-; SDAG-NEXT:    v_min_u32_e32 v1, v9, v30
-; SDAG-NEXT:    v_add_i32_e64 v4, s[8:9], 64, v4
-; SDAG-NEXT:    v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9]
-; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v15, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v0, v14, v10, s[4:5]
-; SDAG-NEXT:    v_ffbh_u32_e32 v10, v29
-; SDAG-NEXT:    v_ffbh_u32_e32 v11, v28
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, v5, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v13, v4, v1, vcc
-; SDAG-NEXT:    v_cndmask_b32_e64 v1, v15, v9, s[4:5]
-; SDAG-NEXT:    v_or_b32_e32 v4, v29, v0
-; SDAG-NEXT:    v_ffbh_u32_e32 v9, v0
-; SDAG-NEXT:    v_add_i32_e32 v10, vcc, 32, v10
-; SDAG-NEXT:    v_or_b32_e32 v5, v28, v1
-; SDAG-NEXT:    v_add_i32_e32 v9, vcc, 32, v9
-; SDAG-NEXT:    v_ffbh_u32_e32 v14, v1
-; SDAG-NEXT:    v_min_u32_e32 v10, v10, v11
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
-; SDAG-NEXT:    v_min_u32_e32 v4, v9, v14
-; SDAG-NEXT:    v_add_i32_e64 v5, s[4:5], 64, v10
-; SDAG-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v9, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v13
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v9, v12, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v9, 0x7f, v4
-; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v8, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; SDAG-NEXT:    v_or_b32_e32 v8, v9, v10
+; SDAG-NEXT:    v_cndmask_b32_e64 v28, v13, v8, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v29, v12, v3, s[4:5]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT:    v_min_u32_e32 v1, v6, v30
+; SDAG-NEXT:    v_add_i32_e64 v2, s[8:9], 64, v2
+; SDAG-NEXT:    v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9]
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, 0, v15, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v0, v14, v7, s[4:5]
+; SDAG-NEXT:    v_ffbh_u32_e32 v7, v29
+; SDAG-NEXT:    v_ffbh_u32_e32 v8, v28
 ; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v9, v5, v11
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT:    v_and_b32_e32 v8, 1, v12
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
+; SDAG-NEXT:    v_cndmask_b32_e64 v12, v3, 0, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v13, v2, v1, vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v1, v15, v6, s[4:5]
+; SDAG-NEXT:    v_or_b32_e32 v2, v29, v0
+; SDAG-NEXT:    v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT:    v_add_i32_e32 v7, vcc, 32, v7
+; SDAG-NEXT:    v_or_b32_e32 v3, v28, v1
+; SDAG-NEXT:    v_add_i32_e32 v6, vcc, 32, v6
+; SDAG-NEXT:    v_ffbh_u32_e32 v14, v1
+; SDAG-NEXT:    v_min_u32_e32 v7, v7, v8
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT:    v_min_u32_e32 v2, v6, v14
+; SDAG-NEXT:    v_add_i32_e64 v3, s[4:5], 64, v7
+; SDAG-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v6, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v13
+; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v6, v12, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v6, 0x7f, v2
+; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[10:11], v[2:3]
+; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v8
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_or_b32_e32 v7, v3, v9
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v7, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v6, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v14, v3, 0, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v2, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v11, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v10, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v14, v5, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v4, 0, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
-; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v4
-; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v4
-; SDAG-NEXT:    v_mov_b32_e32 v8, 0
-; SDAG-NEXT:    v_mov_b32_e32 v9, 0
-; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v5, vcc
-; SDAG-NEXT:    v_lshl_b64 v[12:13], v[2:3], v12
-; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v10, vcc
-; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v11, vcc
-; SDAG-NEXT:    v_or_b32_e32 v10, v30, v32
-; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v4
-; SDAG-NEXT:    v_or_b32_e32 v11, v31, v33
-; SDAG-NEXT:    v_lshl_b64 v[4:5], v[6:7], v34
+; SDAG-NEXT:    v_add_i32_e32 v30, vcc, 1, v2
+; SDAG-NEXT:    v_sub_i32_e64 v12, s[4:5], 63, v2
+; SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, 0, v3, vcc
+; SDAG-NEXT:    v_lshl_b64 v[12:13], v[4:5], v12
+; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, 0, v8, vcc
+; SDAG-NEXT:    v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT:    v_or_b32_e32 v8, v30, v32
+; SDAG-NEXT:    v_sub_i32_e32 v34, vcc, 0x7f, v2
+; SDAG-NEXT:    v_or_b32_e32 v9, v31, v33
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[10:11], v34
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v34
-; SDAG-NEXT:    v_lshl_b64 v[14:15], v[2:3], v34
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; SDAG-NEXT:    v_lshr_b64 v[10:11], v[2:3], v35
-; SDAG-NEXT:    v_or_b32_e32 v5, v5, v11
-; SDAG-NEXT:    v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT:    v_lshl_b64 v[14:15], v[4:5], v34
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_lshr_b64 v[8:9], v[4:5], v35
+; SDAG-NEXT:    v_or_b32_e32 v3, v3, v9
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v8
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v4, v12, v4, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v15, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v14, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v13, v3, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v12, v2, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v9, 0, v15, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v14, s[4:5]
 ; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v34
-; SDAG-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB0_11
 ; SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
-; SDAG-NEXT:    v_lshr_b64 v[8:9], v[2:3], v30
+; SDAG-NEXT:    v_lshr_b64 v[6:7], v[4:5], v30
 ; SDAG-NEXT:    v_sub_i32_e32 v35, vcc, 64, v30
 ; SDAG-NEXT:    v_subrev_i32_e32 v36, vcc, 64, v30
-; SDAG-NEXT:    v_lshr_b64 v[37:38], v[6:7], v30
+; SDAG-NEXT:    v_lshr_b64 v[37:38], v[10:11], v30
 ; SDAG-NEXT:    v_add_i32_e32 v34, vcc, -1, v29
 ; SDAG-NEXT:    v_mov_b32_e32 v12, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v14, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, 0
 ; SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; SDAG-NEXT:    v_lshl_b64 v[48:49], v[6:7], v35
-; SDAG-NEXT:    v_lshr_b64 v[6:7], v[6:7], v36
+; SDAG-NEXT:    v_lshl_b64 v[48:49], v[10:11], v35
+; SDAG-NEXT:    v_lshr_b64 v[10:11], v[10:11], v36
 ; SDAG-NEXT:    v_addc_u32_e32 v35, vcc, -1, v28, vcc
-; SDAG-NEXT:    v_or_b32_e32 v9, v9, v49
-; SDAG-NEXT:    v_or_b32_e32 v8, v8, v48
+; SDAG-NEXT:    v_or_b32_e32 v7, v7, v49
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v48
 ; SDAG-NEXT:    v_addc_u32_e32 v36, vcc, -1, v0, vcc
 ; SDAG-NEXT:    v_cmp_gt_u32_e64 s[4:5], 64, v30
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, v7, v9, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v6, v8, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v38, s[4:5]
-; SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v37, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v11, 0, v38, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, 0, v37, s[4:5]
 ; SDAG-NEXT:    v_addc_u32_e32 v37, vcc, -1, v1, vcc
 ; SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
-; SDAG-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; SDAG-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; SDAG-NEXT:    v_mov_b32_e32 v7, 0
 ; SDAG-NEXT:  .LBB0_9: ; %udiv-do-while
 ; SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SDAG-NEXT:    v_lshl_b64 v[6:7], v[6:7], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
-; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v5
-; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v11
 ; SDAG-NEXT:    v_lshl_b64 v[10:11], v[10:11], 1
-; SDAG-NEXT:    v_or_b32_e32 v6, v6, v8
-; SDAG-NEXT:    v_or_b32_e32 v2, v2, v38
-; SDAG-NEXT:    v_or_b32_e32 v4, v4, v39
-; SDAG-NEXT:    v_or_b32_e32 v5, v13, v5
-; SDAG-NEXT:    v_or_b32_e32 v11, v15, v11
-; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, v34, v2
-; SDAG-NEXT:    v_or_b32_e32 v4, v12, v4
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v35, v3, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v36, v6, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v8, vcc, v37, v7, vcc
-; SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v8
-; SDAG-NEXT:    v_and_b32_e32 v15, v8, v29
-; SDAG-NEXT:    v_and_b32_e32 v38, v8, v28
-; SDAG-NEXT:    v_and_b32_e32 v39, v8, v0
-; SDAG-NEXT:    v_and_b32_e32 v48, v8, v1
-; SDAG-NEXT:    v_sub_i32_e32 v2, vcc, v2, v15
-; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v3, v38, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v6, v39, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v7, v48, vcc
+; SDAG-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; SDAG-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v38, 31, v3
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v39, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT:    v_or_b32_e32 v6, v10, v6
+; SDAG-NEXT:    v_or_b32_e32 v4, v4, v38
+; SDAG-NEXT:    v_or_b32_e32 v2, v2, v39
+; SDAG-NEXT:    v_or_b32_e32 v3, v13, v3
+; SDAG-NEXT:    v_or_b32_e32 v9, v15, v9
+; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v34, v4
+; SDAG-NEXT:    v_or_b32_e32 v2, v12, v2
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v35, v5, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v36, v6, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v37, v11, vcc
+; SDAG-NEXT:    v_ashrrev_i32_e32 v15, 31, v10
+; SDAG-NEXT:    v_and_b32_e32 v10, v15, v29
+; SDAG-NEXT:    v_and_b32_e32 v38, v15, v28
+; SDAG-NEXT:    v_and_b32_e32 v39, v15, v0
+; SDAG-NEXT:    v_and_b32_e32 v48, v15, v1
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v10
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v5, v38, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, v6, v39, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v48, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v30, vcc, -1, v30
 ; SDAG-NEXT:    v_addc_u32_e32 v31, vcc, -1, v31, vcc
 ; SDAG-NEXT:    v_addc_u32_e32 v32, vcc, -1, v32, vcc
@@ -379,47 +367,47 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_or_b32_e32 v38, v30, v32
 ; SDAG-NEXT:    v_or_b32_e32 v39, v31, v33
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
-; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
+; SDAG-NEXT:    v_and_b32_e32 v6, 1, v15
 ; SDAG-NEXT:    s_or_b64 s[10:11], vcc, s[10:11]
-; SDAG-NEXT:    v_or_b32_e32 v10, v14, v10
-; SDAG-NEXT:    v_mov_b32_e32 v15, v9
-; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_or_b32_e32 v8, v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v15, v7
+; SDAG-NEXT:    v_mov_b32_e32 v14, v6
 ; SDAG-NEXT:    s_andn2_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:    s_cbranch_execnz .LBB0_9
 ; SDAG-NEXT:  ; %bb.10: ; %Flow
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; SDAG-NEXT:  .LBB0_11: ; %Flow11
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; SDAG-NEXT:    v_lshl_b64 v[0:1], v[4:5], 1
-; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 31, v11
-; SDAG-NEXT:    v_lshl_b64 v[2:3], v[10:11], 1
+; SDAG-NEXT:    v_lshl_b64 v[0:1], v[2:3], 1
+; SDAG-NEXT:    v_lshrrev_b32_e32 v4, 31, v9
+; SDAG-NEXT:    v_lshl_b64 v[2:3], v[8:9], 1
 ; SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; SDAG-NEXT:    v_or_b32_e32 v13, v13, v1
-; SDAG-NEXT:    v_or_b32_e32 v14, v9, v3
-; SDAG-NEXT:    v_or_b32_e32 v9, v12, v0
-; SDAG-NEXT:    v_or_b32_e32 v8, v8, v2
+; SDAG-NEXT:    v_or_b32_e32 v14, v7, v3
+; SDAG-NEXT:    v_or_b32_e32 v7, v12, v0
+; SDAG-NEXT:    v_or_b32_e32 v6, v6, v2
 ; SDAG-NEXT:  .LBB0_12: ; %Flow12
 ; SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; SDAG-NEXT:    v_xor_b32_e32 v3, v27, v26
 ; SDAG-NEXT:    v_xor_b32_e32 v2, v25, v24
-; SDAG-NEXT:    v_xor_b32_e32 v7, v21, v20
-; SDAG-NEXT:    v_xor_b32_e32 v6, v17, v16
-; SDAG-NEXT:    v_xor_b32_e32 v4, v18, v3
+; SDAG-NEXT:    v_xor_b32_e32 v8, v21, v20
+; SDAG-NEXT:    v_xor_b32_e32 v9, v19, v18
+; SDAG-NEXT:    v_xor_b32_e32 v4, v16, v3
 ; SDAG-NEXT:    v_xor_b32_e32 v5, v22, v2
-; SDAG-NEXT:    v_xor_b32_e32 v1, v19, v3
+; SDAG-NEXT:    v_xor_b32_e32 v1, v17, v3
 ; SDAG-NEXT:    v_xor_b32_e32 v0, v23, v2
-; SDAG-NEXT:    v_xor_b32_e32 v10, v13, v7
-; SDAG-NEXT:    v_xor_b32_e32 v9, v9, v6
-; SDAG-NEXT:    v_xor_b32_e32 v11, v14, v7
+; SDAG-NEXT:    v_xor_b32_e32 v10, v13, v8
+; SDAG-NEXT:    v_xor_b32_e32 v7, v7, v9
+; SDAG-NEXT:    v_xor_b32_e32 v11, v14, v8
 ; SDAG-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v2, vcc, v5, v2, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v3, vcc, v4, v3, vcc
-; SDAG-NEXT:    v_xor_b32_e32 v4, v8, v6
-; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
-; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v11, v7, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v9, v6, vcc
-; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v10, v7, vcc
+; SDAG-NEXT:    v_xor_b32_e32 v4, v6, v9
+; SDAG-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
+; SDAG-NEXT:    v_subb_u32_e32 v5, vcc, v11, v8, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v6, vcc, v7, v9, vcc
+; SDAG-NEXT:    v_subb_u32_e32 v7, vcc, v10, v8, vcc
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: v_sdiv_v2i128_vv:
@@ -460,8 +448,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_ffbh_u32_e32 v29, v11
 ; GISEL-NEXT:    v_ffbh_u32_e32 v30, v20
 ; GISEL-NEXT:    v_ffbh_u32_e32 v31, v21
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; GISEL-NEXT:    v_min_u32_e32 v0, v8, v9
 ; GISEL-NEXT:    v_min_u32_e32 v1, v23, v22
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v28
@@ -470,7 +458,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v3, v31, v3
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 64, v1
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -481,25 +469,26 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v2
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[16:17]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[2:3], v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v8, v8, v0
 ; GISEL-NEXT:    v_or_b32_e32 v9, v3, v1
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v9, v22, v16
+; GISEL-NEXT:    v_and_b32_e32 v16, 1, v9
+; GISEL-NEXT:    v_xor_b32_e32 v9, 1, v9
 ; GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT:    v_and_b32_e32 v9, 1, v9
-; GISEL-NEXT:    v_and_b32_e32 v8, 1, v8
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, v18, 0, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_and_b32_e32 v16, 1, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v20, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v21, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v16
 ; GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v23, v19, 0, vcc
 ; GISEL-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
@@ -651,8 +640,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 32, v17
 ; GISEL-NEXT:    v_ffbh_u32_e32 v28, v13
 ; GISEL-NEXT:    v_ffbh_u32_e32 v29, v12
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; GISEL-NEXT:    v_min_u32_e32 v0, v14, v15
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v27
 ; GISEL-NEXT:    v_min_u32_e32 v2, v16, v17
@@ -661,7 +650,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v1, v26, v1
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 64, v2
 ; GISEL-NEXT:    v_min_u32_e32 v3, v28, v3
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -671,26 +660,27 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[2:3], v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v10, 0x7f, v2
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v10, v10, v0
 ; GISEL-NEXT:    v_or_b32_e32 v11, v3, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v11, v14, v15
 ; GISEL-NEXT:    v_and_b32_e32 v14, 1, v11
-; GISEL-NEXT:    v_or_b32_e32 v10, v11, v10
+; GISEL-NEXT:    v_xor_b32_e32 v11, 1, v11
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, v6, 0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v16, 1, v10
+; GISEL-NEXT:    v_or_b32_e32 v16, v11, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, v7, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v12, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v16, 1, v16
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
 ; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -847,9 +837,9 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ffbh_u32_e32 v26, v0
 ; SDAG-NEXT:    v_ffbh_u32_e32 v27, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v28, 0
-; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
 ; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
 ; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
@@ -858,7 +848,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
 ; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
 ; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[10:11], vcc, s[4:5]
 ; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
 ; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
@@ -873,24 +863,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subb_u32_e32 v23, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v22
 ; SDAG-NEXT:    v_subb_u32_e32 v24, vcc, 0, v28, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[8:9], v[22:23]
 ; SDAG-NEXT:    v_subb_u32_e32 v25, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v16, v16, v24
 ; SDAG-NEXT:    v_or_b32_e32 v17, v23, v25
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[24:25]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v19, v18, s[4:5]
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v16
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[16:17]
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v16, v3, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v17, v2, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v1, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, v0, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB1_6
@@ -1022,18 +1006,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ffbh_u32_e32 v22, v4
 ; SDAG-NEXT:    v_ffbh_u32_e32 v23, v5
 ; SDAG-NEXT:    v_mov_b32_e32 v24, 0
-; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; SDAG-NEXT:    v_add_i32_e64 v0, s[6:7], 32, v8
 ; SDAG-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v10
 ; SDAG-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v20
 ; SDAG-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v22
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
 ; SDAG-NEXT:    v_min_u32_e32 v0, v0, v9
 ; SDAG-NEXT:    v_min_u32_e32 v1, v1, v11
 ; SDAG-NEXT:    v_min_u32_e32 v2, v2, v21
 ; SDAG-NEXT:    v_min_u32_e32 v3, v3, v23
+; SDAG-NEXT:    s_and_b64 s[6:7], vcc, s[4:5]
 ; SDAG-NEXT:    v_add_i32_e32 v1, vcc, 64, v1
 ; SDAG-NEXT:    v_addc_u32_e64 v8, s[4:5], 0, 0, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v3, vcc, 64, v3
@@ -1048,25 +1032,19 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subb_u32_e32 v1, vcc, v8, v1, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v2, 0x7f, v0
 ; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v24, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[8:9], v[0:1]
 ; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v24, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v2, v2, v20
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v3, v1, v21
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; SDAG-NEXT:    v_and_b32_e32 v2, 1, v8
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v2
+; SDAG-NEXT:    v_or_b32_e32 v3, v1, v21
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v7, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v9, v6, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v10, v5, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, v4, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB1_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
@@ -1214,8 +1192,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_mov_b32_e32 v24, 0x7f
 ; GISEL-NEXT:    v_mov_b32_e32 v25, 0
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v21
 ; GISEL-NEXT:    v_add_i32_e64 v3, s[6:7], 32, v23
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v27
@@ -1224,7 +1202,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v3, v22, v3
 ; GISEL-NEXT:    v_min_u32_e32 v18, v26, v18
 ; GISEL-NEXT:    v_min_u32_e32 v19, v29, v19
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, 64, v2
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
@@ -1237,25 +1215,26 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v22
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[22:23], v[24:25]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v2, v2, v20
 ; GISEL-NEXT:    v_or_b32_e32 v3, v23, v21
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v3, v26, v18
+; GISEL-NEXT:    v_and_b32_e32 v18, 1, v3
+; GISEL-NEXT:    v_xor_b32_e32 v3, 1, v3
 ; GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
-; GISEL-NEXT:    v_and_b32_e32 v3, 1, v3
-; GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, v0, 0, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v2
+; GISEL-NEXT:    v_and_b32_e32 v19, 1, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v16, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v19
 ; GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, v1, 0, vcc
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1387,8 +1366,8 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_ffbh_u32_e32 v25, v6
 ; GISEL-NEXT:    v_mov_b32_e32 v10, 0x7f
 ; GISEL-NEXT:    v_mov_b32_e32 v11, 0
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 32, v17
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v21
 ; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], 32, v23
@@ -1397,7 +1376,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v1, v20, v1
 ; GISEL-NEXT:    v_min_u32_e32 v8, v22, v8
 ; GISEL-NEXT:    v_min_u32_e32 v9, v24, v9
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, 64, v0
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 64, v8
@@ -1409,26 +1388,27 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[16:17], v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v8, 0x7f, v16
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v8, v8, v0
 ; GISEL-NEXT:    v_or_b32_e32 v9, v17, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v9, v20, v10
 ; GISEL-NEXT:    v_and_b32_e32 v10, 1, v9
-; GISEL-NEXT:    v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT:    v_xor_b32_e32 v9, 1, v9
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, v4, 0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v20, 1, v8
+; GISEL-NEXT:    v_or_b32_e32 v20, v9, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, v5, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v6, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v20
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
 ; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1564,7 +1544,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_sub_i32_e32 v16, vcc, 0, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v19, 0
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x80
 ; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, 0, v1, vcc
 ; SDAG-NEXT:    v_mov_b32_e32 v29, v28
 ; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v2, vcc
@@ -1589,7 +1569,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_cndmask_b32_e64 v30, v9, v23, s[4:5]
 ; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v10, vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v31, v8, v21, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[2:3]
 ; SDAG-NEXT:    v_min_u32_e32 v3, v20, v22
 ; SDAG-NEXT:    v_add_i32_e64 v8, s[8:9], 64, v18
 ; SDAG-NEXT:    v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9]
@@ -1608,36 +1588,30 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_min_u32_e32 v11, v11, v21
 ; SDAG-NEXT:    v_add_i32_e32 v20, vcc, 32, v20
 ; SDAG-NEXT:    v_ffbh_u32_e32 v21, v3
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
 ; SDAG-NEXT:    v_min_u32_e32 v8, v20, v21
 ; SDAG-NEXT:    v_add_i32_e64 v9, s[4:5], 64, v11
 ; SDAG-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5]
 ; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v11, v11, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[4:5]
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; SDAG-NEXT:    s_and_b64 s[8:9], vcc, s[6:7]
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v8, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v11, v18, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v18, vcc, 0, v19, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[10:11], v[10:11]
 ; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, 0, v19, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v8, v8, v18
 ; SDAG-NEXT:    v_or_b32_e32 v9, v11, v19
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
-; SDAG-NEXT:    v_cndmask_b32_e64 v8, v21, v20, s[4:5]
-; SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v8
-; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v34, v1, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v27, v17, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v33, v16, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_6
@@ -1759,7 +1733,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v26, 31, v7
 ; SDAG-NEXT:    v_sub_i32_e32 v8, vcc, 0, v4
 ; SDAG-NEXT:    v_mov_b32_e32 v18, 0
-; SDAG-NEXT:    s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT:    s_mov_b64 s[10:11], 0x80
 ; SDAG-NEXT:    v_mov_b32_e32 v35, v26
 ; SDAG-NEXT:    v_subb_u32_e32 v9, vcc, 0, v5, vcc
 ; SDAG-NEXT:    v_subb_u32_e32 v10, vcc, 0, v6, vcc
@@ -1784,7 +1758,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_cmp_gt_i64_e64 s[4:5], 0, v[14:15]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v36, v13, v21, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v37, v12, v19, s[4:5]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[6:7], 0, v[6:7]
 ; SDAG-NEXT:    v_min_u32_e32 v7, v20, v22
 ; SDAG-NEXT:    v_add_i32_e64 v10, s[8:9], 64, v10
 ; SDAG-NEXT:    v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9]
@@ -1803,37 +1777,31 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_add_i32_e32 v13, vcc, 32, v13
 ; SDAG-NEXT:    v_ffbh_u32_e32 v20, v7
 ; SDAG-NEXT:    v_min_u32_e32 v14, v15, v14
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
 ; SDAG-NEXT:    v_min_u32_e32 v10, v13, v20
 ; SDAG-NEXT:    v_add_i32_e64 v11, s[4:5], 64, v14
 ; SDAG-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; SDAG-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
-; SDAG-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[6:7]
+; SDAG-NEXT:    v_cndmask_b32_e64 v13, v13, 0, s[4:5]
+; SDAG-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[6:7], vcc, s[6:7]
 ; SDAG-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; SDAG-NEXT:    v_subb_u32_e32 v11, vcc, v13, v12, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v14, 0x7f, v10
 ; SDAG-NEXT:    v_subb_u32_e32 v12, vcc, 0, v18, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
-; SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[10:11], v[10:11]
 ; SDAG-NEXT:    v_subb_u32_e32 v13, vcc, 0, v18, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v14, v14, v12
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v15, v11, v13
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
-; SDAG-NEXT:    v_cndmask_b32_e32 v18, v18, v19, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; SDAG-NEXT:    v_and_b32_e32 v14, 1, v18
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v14
+; SDAG-NEXT:    v_or_b32_e32 v15, v11, v13
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, v5, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v4, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v15, v9, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v14, v8, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB2_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
@@ -2058,8 +2026,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_ffbh_u32_e32 v25, v11
 ; GISEL-NEXT:    v_ffbh_u32_e32 v26, v8
 ; GISEL-NEXT:    v_ffbh_u32_e32 v27, v9
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[2:3]
 ; GISEL-NEXT:    v_min_u32_e32 v0, v18, v21
 ; GISEL-NEXT:    v_min_u32_e32 v1, v22, v23
 ; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 32, v24
@@ -2068,7 +2036,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v3, v27, v3
 ; GISEL-NEXT:    v_add_i32_e64 v0, s[6:7], 64, v0
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 64, v1
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
@@ -2079,25 +2047,26 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v2
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[19:20]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[2:3], v[19:20]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v18, v18, v0
 ; GISEL-NEXT:    v_or_b32_e32 v19, v3, v1
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v20, v22, v20, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v19, v21, v20
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v19
+; GISEL-NEXT:    v_xor_b32_e32 v19, 1, v19
 ; GISEL-NEXT:    v_or_b32_e32 v18, v19, v18
-; GISEL-NEXT:    v_and_b32_e32 v19, 1, v19
-; GISEL-NEXT:    v_and_b32_e32 v18, 1, v18
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v19
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v31, v16, 0, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v18
+; GISEL-NEXT:    v_and_b32_e32 v20, 1, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, v8, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v20
 ; GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v32, v17, 0, vcc
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2249,8 +2218,8 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_add_i32_e32 v23, vcc, 32, v23
 ; GISEL-NEXT:    v_ffbh_u32_e32 v26, v7
 ; GISEL-NEXT:    v_ffbh_u32_e32 v27, v6
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[14:15]
 ; GISEL-NEXT:    v_min_u32_e32 v0, v20, v21
 ; GISEL-NEXT:    v_add_i32_e64 v1, s[6:7], 32, v25
 ; GISEL-NEXT:    v_min_u32_e32 v14, v22, v23
@@ -2259,7 +2228,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v1, v24, v1
 ; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], 64, v14
 ; GISEL-NEXT:    v_min_u32_e32 v15, v26, v15
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
@@ -2269,26 +2238,27 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v15, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[14:15], v[2:3]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[14:15], v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v14
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v2, v2, v0
 ; GISEL-NEXT:    v_or_b32_e32 v3, v15, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v21, v22, v21, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v3, v20, v21
 ; GISEL-NEXT:    v_and_b32_e32 v20, 1, v3
-; GISEL-NEXT:    v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT:    v_xor_b32_e32 v3, 1, v3
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, v12, 0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v22, 1, v2
+; GISEL-NEXT:    v_or_b32_e32 v22, v3, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, v13, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v22
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
 ; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2477,9 +2447,9 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ffbh_u32_e32 v26, v0
 ; SDAG-NEXT:    v_ffbh_u32_e32 v27, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v28, 0
-; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
 ; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
 ; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
@@ -2488,7 +2458,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
 ; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
 ; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT:    s_and_b64 s[10:11], vcc, s[4:5]
 ; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
 ; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
@@ -2503,24 +2473,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subb_u32_e32 v19, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v18
 ; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19]
-; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[8:9], v[18:19]
 ; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v16, v16, v20
 ; SDAG-NEXT:    v_or_b32_e32 v17, v19, v21
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e64 v16, v23, v22, s[4:5]
-; SDAG-NEXT:    v_and_b32_e32 v16, 1, v16
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v16
-; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[16:17]
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    s_or_b64 s[4:5], s[10:11], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v33, v3, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT:    s_and_b64 s[8:9], s[4:5], s[6:7]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v31, v2, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v30, v1, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[8:9], s[6:7], vcc
 ; SDAG-NEXT:    v_cndmask_b32_e64 v32, v0, 0, s[4:5]
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[8:9]
 ; SDAG-NEXT:    s_cbranch_execz .LBB3_6
@@ -2652,18 +2616,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_ffbh_u32_e32 v26, v4
 ; SDAG-NEXT:    v_ffbh_u32_e32 v27, v5
 ; SDAG-NEXT:    v_mov_b32_e32 v28, 0
-; SDAG-NEXT:    s_mov_b64 s[8:9], 0x7f
-; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; SDAG-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT:    s_mov_b64 s[8:9], 0x80
+; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; SDAG-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v20
 ; SDAG-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v22
 ; SDAG-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v24
 ; SDAG-NEXT:    v_add_i32_e64 v19, s[6:7], 32, v26
-; SDAG-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
 ; SDAG-NEXT:    v_min_u32_e32 v16, v16, v21
 ; SDAG-NEXT:    v_min_u32_e32 v17, v17, v23
 ; SDAG-NEXT:    v_min_u32_e32 v18, v18, v25
 ; SDAG-NEXT:    v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT:    s_and_b64 s[6:7], vcc, s[4:5]
 ; SDAG-NEXT:    v_add_i32_e32 v17, vcc, 64, v17
 ; SDAG-NEXT:    v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
 ; SDAG-NEXT:    v_add_i32_e32 v19, vcc, 64, v19
@@ -2678,25 +2642,19 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-NEXT:    v_subb_u32_e32 v17, vcc, v20, v17, vcc
 ; SDAG-NEXT:    v_xor_b32_e32 v18, 0x7f, v16
 ; SDAG-NEXT:    v_subb_u32_e32 v20, vcc, 0, v28, vcc
-; SDAG-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
-; SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT:    v_cmp_gt_u64_e64 s[4:5], s[8:9], v[16:17]
 ; SDAG-NEXT:    v_subb_u32_e32 v21, vcc, 0, v28, vcc
 ; SDAG-NEXT:    v_or_b32_e32 v18, v18, v20
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
-; SDAG-NEXT:    v_or_b32_e32 v19, v17, v21
 ; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
-; SDAG-NEXT:    v_cndmask_b32_e32 v22, v23, v22, vcc
-; SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
-; SDAG-NEXT:    v_and_b32_e32 v18, 1, v22
-; SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v18
+; SDAG-NEXT:    v_or_b32_e32 v19, v17, v21
+; SDAG-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
 ; SDAG-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v23, v7, 0, s[4:5]
-; SDAG-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
 ; SDAG-NEXT:    v_cndmask_b32_e64 v22, v6, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v19, v5, 0, s[4:5]
 ; SDAG-NEXT:    v_cndmask_b32_e64 v18, v4, 0, s[4:5]
-; SDAG-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SDAG-NEXT:    s_cbranch_execz .LBB3_12
 ; SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
@@ -2883,8 +2841,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 0x7f
 ; GISEL-NEXT:    v_mov_b32_e32 v21, 0
 ; GISEL-NEXT:    s_mov_b64 s[8:9], 0
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v23
 ; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v25
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v27
@@ -2893,7 +2851,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v17, v24, v17
 ; GISEL-NEXT:    v_min_u32_e32 v18, v26, v18
 ; GISEL-NEXT:    v_min_u32_e32 v19, v29, v19
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 64, v16
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
@@ -2906,25 +2864,26 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_xor_b32_e32 v23, 0x7f, v18
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[18:19], v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v20, v23, v16
 ; GISEL-NEXT:    v_or_b32_e32 v21, v19, v17
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v23, v23, v24, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v21, v22, v23
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v21
+; GISEL-NEXT:    v_xor_b32_e32 v21, 1, v21
 ; GISEL-NEXT:    v_or_b32_e32 v20, v21, v20
-; GISEL-NEXT:    v_and_b32_e32 v21, 1, v21
-; GISEL-NEXT:    v_and_b32_e32 v20, 1, v20
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v21
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
 ; GISEL-NEXT:    v_cndmask_b32_e64 v32, v0, 0, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v20
+; GISEL-NEXT:    v_and_b32_e32 v22, 1, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v20, v2, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v22
 ; GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v33, v1, 0, vcc
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
@@ -3056,8 +3015,8 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_ffbh_u32_e32 v31, v6
 ; GISEL-NEXT:    v_mov_b32_e32 v24, 0x7f
 ; GISEL-NEXT:    v_mov_b32_e32 v25, 0
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
-; GISEL-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[18:19]
 ; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], 32, v23
 ; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], 32, v27
 ; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], 32, v29
@@ -3066,7 +3025,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_min_u32_e32 v17, v26, v17
 ; GISEL-NEXT:    v_min_u32_e32 v18, v28, v18
 ; GISEL-NEXT:    v_min_u32_e32 v19, v30, v19
-; GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 64, v16
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, 64, v18
@@ -3078,26 +3037,27 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v23, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25]
+; GISEL-NEXT:    v_cmp_le_u64_e32 vcc, v[22:23], v[24:25]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v22
-; GISEL-NEXT:    v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v18, v18, v16
 ; GISEL-NEXT:    v_or_b32_e32 v19, v23, v17
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
-; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_or_b32_e32 v19, v26, v24
 ; GISEL-NEXT:    v_and_b32_e32 v24, 1, v19
-; GISEL-NEXT:    v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT:    v_xor_b32_e32 v19, 1, v19
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v24
 ; GISEL-NEXT:    v_cndmask_b32_e64 v24, v4, 0, vcc
-; GISEL-NEXT:    v_and_b32_e32 v26, 1, v18
+; GISEL-NEXT:    v_or_b32_e32 v26, v19, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v25, v5, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, v6, 0, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT:    v_and_b32_e32 v26, 1, v26
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v26
 ; GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
 ; GISEL-NEXT:    s_and_saveexec_b64 s[12:13], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 1c298014e33e7..2bfe5492263d3 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -27,27 +27,27 @@ define amdgpu_gfx [13 x i32] @issue130120() {
 ; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:  .LBB0_1: ; %bb3
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_cmp_eq_u32 s46, 0
+; CHECK-NEXT:    s_cmp_lg_u32 s46, 0
 ; CHECK-NEXT:    s_mov_b32 s49, s48
 ; CHECK-NEXT:    s_mov_b32 s50, s48
-; CHECK-NEXT:    s_cselect_b32 s51, 0, s1
-; CHECK-NEXT:    s_cselect_b32 s55, 0, s35
+; CHECK-NEXT:    s_cselect_b32 s51, s1, 0
+; CHECK-NEXT:    s_cselect_b32 s55, s35, 0
 ; CHECK-NEXT:    v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
-; CHECK-NEXT:    s_cselect_b32 s52, 0, s2
-; CHECK-NEXT:    s_cselect_b32 s56, 0, s36
-; CHECK-NEXT:    s_cselect_b32 vcc_lo, 0, s43
+; CHECK-NEXT:    s_cselect_b32 s52, s2, 0
+; CHECK-NEXT:    s_cselect_b32 s56, s36, 0
+; CHECK-NEXT:    s_cselect_b32 vcc_lo, s43, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, s50
-; CHECK-NEXT:    s_cselect_b32 s47, s45, 0xf0
-; CHECK-NEXT:    s_cselect_b32 s53, 0, s3
-; CHECK-NEXT:    s_cselect_b32 s54, 0, s34
-; CHECK-NEXT:    s_cselect_b32 s57, 0, s37
-; CHECK-NEXT:    s_cselect_b32 s58, 0, s38
-; CHECK-NEXT:    s_cselect_b32 s59, 0, s0
-; CHECK-NEXT:    s_cselect_b32 s60, 0, s39
-; CHECK-NEXT:    s_cselect_b32 s61, 0, s40
-; CHECK-NEXT:    s_cselect_b32 s62, 0, s41
-; CHECK-NEXT:    s_cselect_b32 s63, 0, s42
-; CHECK-NEXT:    s_cselect_b32 vcc_hi, 0, s44
+; CHECK-NEXT:    s_cselect_b32 s47, 0xf0, s45
+; CHECK-NEXT:    s_cselect_b32 s53, s3, 0
+; CHECK-NEXT:    s_cselect_b32 s54, s34, 0
+; CHECK-NEXT:    s_cselect_b32 s57, s37, 0
+; CHECK-NEXT:    s_cselect_b32 s58, s38, 0
+; CHECK-NEXT:    s_cselect_b32 s59, s0, 0
+; CHECK-NEXT:    s_cselect_b32 s60, s39, 0
+; CHECK-NEXT:    s_cselect_b32 s61, s40, 0
+; CHECK-NEXT:    s_cselect_b32 s62, s41, 0
+; CHECK-NEXT:    s_cselect_b32 s63, s42, 0
+; CHECK-NEXT:    s_cselect_b32 vcc_hi, s44, 0
 ; CHECK-NEXT:    s_mov_b32 s46, s48
 ; CHECK-NEXT:    scratch_store_b32 off, v0, s51
 ; CHECK-NEXT:    scratch_store_b32 off, v0, s52
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index 6512bee36e88b..59888b614a837 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -11,31 +11,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9:       ; %bb.0: ; %_udiv-special-cases
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v3, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v20, 31, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v11, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, 0, v4
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v6, vcc
-; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v7, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v21, v20
-; GFX9-NEXT:    v_cndmask_b32_e32 v22, v5, v9, vcc
+; GFX9-NEXT:    v_ffbh_u32_e32 v13, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v22, v5, v10, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v23, v4, v8, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v6, v11, vcc
 ; GFX9-NEXT:    v_or_b32_e32 v7, v22, v5
 ; GFX9-NEXT:    v_or_b32_e32 v6, v23, v4
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v7, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v6, v0, v2
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[6:7]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[6:7]
 ; GFX9-NEXT:    v_ffbh_u32_e32 v6, v4
 ; GFX9-NEXT:    v_add_u32_e32 v6, 32, v6
 ; GFX9-NEXT:    v_ffbh_u32_e32 v7, v5
@@ -44,50 +41,47 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
 ; GFX9-NEXT:    v_ffbh_u32_e32 v8, v22
 ; GFX9-NEXT:    v_min_u32_e32 v7, v7, v8
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 64, v7
 ; GFX9-NEXT:    v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX9-NEXT:    v_ffbh_u32_e32 v10, v3
+; GFX9-NEXT:    v_ffbh_u32_e32 v12, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; GFX9-NEXT:    v_ffbh_u32_e32 v7, v2
 ; GFX9-NEXT:    v_add_u32_e32 v7, 32, v7
-; GFX9-NEXT:    v_min_u32_e32 v7, v7, v10
-; GFX9-NEXT:    v_ffbh_u32_e32 v10, v0
-; GFX9-NEXT:    v_add_u32_e32 v10, 32, v10
-; GFX9-NEXT:    v_ffbh_u32_e32 v11, v1
-; GFX9-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_min_u32_e32 v7, v7, v12
+; GFX9-NEXT:    v_ffbh_u32_e32 v12, v0
+; GFX9-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX9-NEXT:    v_min_u32_e32 v12, v12, v13
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, v8, 0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, 64, v10
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v11, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v13, vcc
 ; GFX9-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v11, v1, v3
+; GFX9-NEXT:    v_or_b32_e32 v10, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, 0, v9, vcc
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7]
-; GFX9-NEXT:    v_or_b32_e32 v12, v7, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
-; GFX9-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX9-NEXT:    v_xor_b32_e32 v11, 0x7f, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX9-NEXT:    v_or_b32_e32 v11, v11, v8
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[11:12]
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[8:9], 0, v[10:11]
+; GFX9-NEXT:    v_xor_b32_e32 v10, 0x7f, v6
+; GFX9-NEXT:    v_or_b32_e32 v11, v7, v9
+; GFX9-NEXT:    v_or_b32_e32 v10, v10, v8
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-NEXT:    v_mov_b32_e32 v21, v20
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v3, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v2, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v1, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v0, 0, s[4:5]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_6
 ; GFX9-NEXT:  ; %bb.1: ; %udiv-bb1
@@ -1503,10 +1497,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_or_b32_e32 v9, v5, v7
 ; GFX9-NEXT:    v_or_b32_e32 v8, v4, v6
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
-; GFX9-NEXT:    v_or_b32_e32 v9, v1, v3
-; GFX9-NEXT:    v_or_b32_e32 v8, v0, v2
-; GFX9-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[4:5], 0, v[8:9]
 ; GFX9-NEXT:    v_ffbh_u32_e32 v8, v6
 ; GFX9-NEXT:    v_add_u32_e32 v8, 32, v8
 ; GFX9-NEXT:    v_ffbh_u32_e32 v9, v7
@@ -1515,7 +1506,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_add_u32_e32 v9, 32, v9
 ; GFX9-NEXT:    v_ffbh_u32_e32 v10, v5
 ; GFX9-NEXT:    v_min_u32_e32 v9, v9, v10
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 64, v9
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
@@ -1526,39 +1516,37 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-NEXT:    v_min_u32_e32 v9, v9, v11
 ; GFX9-NEXT:    v_ffbh_u32_e32 v11, v0
 ; GFX9-NEXT:    v_add_u32_e32 v11, 32, v11
-; GFX9-NEXT:    v_ffbh_u32_e32 v12, v1
-; GFX9-NEXT:    v_min_u32_e32 v11, v11, v12
+; GFX9-NEXT:    v_ffbh_u32_e32 v14, v1
+; GFX9-NEXT:    v_min_u32_e32 v11, v11, v14
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, 0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, 64, v11
-; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc
+; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[6:7], 0, 0, vcc
 ; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0x7f
+; GFX9-NEXT:    v_or_b32_e32 v13, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, 0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v14, 0, vcc
 ; GFX9-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v10, v12, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v9, vcc, v10, v14, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v11, vcc
+; GFX9-NEXT:    v_or_b32_e32 v12, v0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
-; GFX9-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX9-NEXT:    s_mov_b64 s[6:7], 0x80
+; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[6:7], v[8:9]
+; GFX9-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[10:11]
+; GFX9-NEXT:    v_cmp_ne_u64_e64 s[8:9], 0, v[12:13]
 ; GFX9-NEXT:    v_xor_b32_e32 v12, 0x7f, v8
 ; GFX9-NEXT:    v_or_b32_e32 v13, v9, v11
 ; GFX9-NEXT:    v_or_b32_e32 v12, v12, v10
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[4:5], -1
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v15, v3, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v14, v2, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, v1, 0, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, v0, 0, s[4:5]
-; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], vcc
+; GFX9-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_6
 ; GFX9-NEXT:  ; %bb.1: ; %udiv-bb1
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 7ed27f008083e..d8163cdebad8c 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -1015,12 +1015,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_negk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
+; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1029,12 +1029,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_negk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xbc00
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1043,12 +1043,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1071,12 +1071,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1126,12 +1126,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; VI-LABEL: add_select_negliteralk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0xe400
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0xe400
+; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1140,12 +1140,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX9-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe400
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xe400
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1154,12 +1154,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v0.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1182,12 +1182,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v0.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1346,12 +1346,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_posk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1360,12 +1360,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_posk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1374,12 +1374,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1402,12 +1402,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -3836,12 +3836,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_posk_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3850,12 +3850,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4400
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -3864,12 +3864,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-SAFE-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v0.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -3892,12 +3892,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NSZ-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v0.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4066,12 +4066,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_negk_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
-; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0xc400
+; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4080,12 +4080,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc400
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -4094,12 +4094,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-SAFE-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v2.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v0.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4122,12 +4122,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NSZ-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v2.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v0.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index 12ccdfff07c6f..71e41659d41dd 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -40,9 +40,8 @@ define amdgpu_cs void @test_i32_sle(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_i32_sgt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -58,9 +57,8 @@ define amdgpu_cs void @test_i32_sgt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_i32_slt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_i32_slt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -113,11 +111,9 @@ define amdgpu_cs void @test_i64_sle(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_i64_sgt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -133,11 +129,9 @@ define amdgpu_cs void @test_i64_sgt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_i64_slt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -154,9 +148,8 @@ define amdgpu_cs void @test_i64_slt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u32_eq(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u32_eq:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -190,11 +183,10 @@ define amdgpu_cs void @test_negative_case(i32 %a, i32 %p, i32 %q, ptr addrspace(
 define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_mixed:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_dual_cndmask_b32 v2, 0, v3 :: v_dual_cndmask_b32 v3, 0, v4
 ; GCN-NEXT:    global_store_b128 v[5:6], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -214,10 +206,10 @@ define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr ad
 define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_sgpr:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v5, s0, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v6, s1, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, s0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, s1, vcc_lo
 ; GCN-NEXT:    global_store_b96 v[2:3], v[4:6], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -235,9 +227,8 @@ define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr
 define amdgpu_cs void @test_u32_ne(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u32_ne:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -287,9 +278,8 @@ define amdgpu_cs void @test_u32_ule(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u32_ugt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 1, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -305,9 +295,8 @@ define amdgpu_cs void @test_u32_ugt(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u32_ult:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 2, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 3, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -324,11 +313,9 @@ define amdgpu_cs void @test_u32_ult(i32 %a, i32 %p, i32 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u64_eq:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -344,11 +331,9 @@ define amdgpu_cs void @test_u64_eq(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out
 define amdgpu_cs void @test_u64_ne(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u64_ne:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ne_u64_e32 vcc_lo, 1, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -400,11 +385,9 @@ define amdgpu_cs void @test_u64_ule(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u64_ugt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 1, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -420,11 +403,9 @@ define amdgpu_cs void @test_u64_ugt(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_u64_ult(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_u64_ult:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 2, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 3, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:

>From 2589eb30bbb3d126807a544474874877a0f034ce Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 15:30:21 +0200
Subject: [PATCH 3/5] replaced isa<Constant> check with isDivergent check

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |   9 +-
 .../atomic_optimizations_local_pointer.ll     | 482 +++++++++---------
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |   4 +-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |   4 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  12 +-
 llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll  |  90 ++--
 .../issue130120-eliminate-frame-index.ll      |  34 +-
 llvm/test/CodeGen/AMDGPU/known-never-snan.ll  |   8 +-
 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll    |   8 +-
 9 files changed, 326 insertions(+), 325 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7f17132be12aa..c3f9533d36323 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1787,11 +1787,12 @@ bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
 
     auto SelectI = dyn_cast<SelectInst>(User);
 
-    if (isa<Constant>(SelectI->getOperand(1)) &&
-        !isa<Constant>(SelectI->getOperand(2)))
+    auto Op1 = SelectI->getOperand(1);
+    auto Op2 = SelectI->getOperand(2);
+
+    if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
       ShouldSwap++;
-    else if (!isa<Constant>(SelectI->getOperand(1)) &&
-             isa<Constant>(SelectI->getOperand(2)))
+    else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
       ShouldSwap--;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 8e0b3cb9aa1d5..27a0b5e3a48bd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -10964,10 +10964,10 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -10995,13 +10995,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
@@ -11028,13 +11028,13 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -11062,9 +11062,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11093,9 +11093,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11126,9 +11126,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11157,9 +11157,9 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11221,10 +11221,10 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
@@ -11274,13 +11274,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_endpgm
@@ -11329,13 +11329,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[1:2]
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
@@ -11383,9 +11383,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11434,9 +11434,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11492,9 +11492,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11546,9 +11546,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11647,13 +11647,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX8_DPP-NEXT:    s_endpgm
@@ -11736,13 +11736,13 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
@@ -11848,9 +11848,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -11933,9 +11933,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -12055,9 +12055,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT:    v_cmp_le_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1164_DPP-NEXT:    s_endpgm
@@ -12145,9 +12145,9 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmp_le_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -12790,10 +12790,10 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -12821,13 +12821,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
@@ -12854,13 +12854,13 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -12888,9 +12888,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12919,9 +12919,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12952,9 +12952,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12983,9 +12983,9 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13047,10 +13047,10 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
@@ -13100,13 +13100,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_endpgm
@@ -13155,13 +13155,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[1:2]
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
@@ -13209,9 +13209,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13260,9 +13260,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13318,9 +13318,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13372,9 +13372,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13473,13 +13473,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX8_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX8_DPP-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX8_DPP-NEXT:    s_endpgm
@@ -13562,13 +13562,13 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v7
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v8, v2
-; GFX9_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9_DPP-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
@@ -13674,9 +13674,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -13759,9 +13759,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -13881,9 +13881,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT:    v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1164_DPP-NEXT:    s_endpgm
@@ -13971,9 +13971,9 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_cmp_lt_i64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmp_ge_i64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -14614,10 +14614,10 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -14644,13 +14644,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
@@ -14676,13 +14676,13 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -14710,9 +14710,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
+; GFX1064-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14741,9 +14741,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
+; GFX1032-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14774,9 +14774,9 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc
+; GFX1164-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
+; GFX1164-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14802,12 +14802,12 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, s3, vcc_lo
+; GFX1132-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX1132-NEXT:    v_cndmask_b32_e64 v1, s3, 0, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14868,10 +14868,10 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
@@ -14920,13 +14920,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_endpgm
@@ -14974,13 +14974,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[1:2]
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
@@ -15027,9 +15027,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15077,9 +15077,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15135,9 +15135,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15189,9 +15189,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15291,13 +15291,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v5
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX8_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
+; GFX8_DPP-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[5:6]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v5, v0, v5, vcc
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX8_DPP-NEXT:    s_endpgm
@@ -15381,13 +15381,13 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v5
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v4
-; GFX9_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[5:6]
+; GFX9_DPP-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[5:6]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v5, v5, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v5, v0, v5, vcc
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
@@ -15493,9 +15493,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -15578,9 +15578,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -15700,9 +15700,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT:    v_cmp_le_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1164_DPP-NEXT:    s_endpgm
@@ -15784,9 +15784,9 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmp_le_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
@@ -16428,10 +16428,10 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX7LESS-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX7LESS-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -16458,13 +16458,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX8-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s5
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
@@ -16490,13 +16490,13 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX9-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s5
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
@@ -16524,9 +16524,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX1064-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1064-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1064-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1064-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16555,9 +16555,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX1032-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1032-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1032-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1032-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16588,9 +16588,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GFX1164-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16619,9 +16619,9 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
 ; GFX1132-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc_lo
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16682,10 +16682,10 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX7LESS_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX7LESS_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX7LESS_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX7LESS_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX7LESS_ITERATIVE-NEXT:    s_endpgm
@@ -16734,13 +16734,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX8_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX8_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX8_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX8_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX8_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX8_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX8_ITERATIVE-NEXT:    s_endpgm
@@ -16788,13 +16788,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v4
 ; GFX9_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
-; GFX9_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[1:2]
+; GFX9_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[1:2]
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9_ITERATIVE-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_ITERATIVE-NEXT:    s_mov_b32 s2, -1
-; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
+; GFX9_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX9_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
@@ -16841,9 +16841,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1064_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1064_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc
-; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[1:2]
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc
+; GFX1064_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1064_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16891,9 +16891,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v4
 ; GFX1032_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v3
-; GFX1032_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[1:2]
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v2, v2, s3, vcc_lo
-; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[1:2]
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v2, s3, v2, vcc_lo
+; GFX1032_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s2, v1, vcc_lo
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1032_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -16949,9 +16949,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1164_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1164_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc, s[2:3], v[0:1]
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc
+; GFX1164_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1164_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -17003,9 +17003,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1132_ITERATIVE-NEXT:    v_readfirstlane_b32 s2, v2
 ; GFX1132_ITERATIVE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[2:3], v[0:1]
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX1132_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_ITERATIVE-NEXT:    s_mov_b32 s2, -1
 ; GFX1132_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
@@ -17104,13 +17104,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX8_DPP-NEXT:    v_readfirstlane_b32 s4, v6
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v7, v2
-; GFX8_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; GFX8_DPP-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX8_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX8_DPP-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX8_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
 ; GFX8_DPP-NEXT:    s_endpgm
@@ -17193,13 +17193,13 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    v_readfirstlane_b32 s4, v6
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v6, v1
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v7, v2
-; GFX9_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; GFX9_DPP-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s5
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v7, v0, v7, vcc
 ; GFX9_DPP-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9_DPP-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9_DPP-NEXT:    s_mov_b32 s2, -1
-; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX9_DPP-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX9_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
@@ -17305,9 +17305,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1064_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1064_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1064_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1064_DPP-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1064_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1064_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1064_DPP-NEXT:    s_endpgm
@@ -17390,9 +17390,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1032_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1032_DPP-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1032_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1032_DPP-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1032_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1032_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX1032_DPP-NEXT:    s_endpgm
@@ -17512,9 +17512,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1164_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1164_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc
-; GFX1164_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc
+; GFX1164_DPP-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[7:8]
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc
+; GFX1164_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc
 ; GFX1164_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1164_DPP-NEXT:    s_endpgm
@@ -17596,9 +17596,9 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX1132_DPP-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1132_DPP-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132_DPP-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132_DPP-NEXT:    v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[7:8]
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v8, v8, s5, vcc_lo
-; GFX1132_DPP-NEXT:    v_cndmask_b32_e64 v7, v7, s4, vcc_lo
+; GFX1132_DPP-NEXT:    v_cmp_ge_u64_e32 vcc_lo, s[4:5], v[7:8]
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v8, s5, v8, vcc_lo
+; GFX1132_DPP-NEXT:    v_cndmask_b32_e32 v7, s4, v7, vcc_lo
 ; GFX1132_DPP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132_DPP-NEXT:    buffer_store_b64 v[7:8], off, s[0:3], 0
 ; GFX1132_DPP-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index e3cc8ee340f0c..b9a6f782fc682 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -1676,10 +1676,10 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_ffbh_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
 ; GFX10-GISEL-NEXT:    v_add_nc_u16 v2, v2, -16
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index d997904d81d54..cabe0a017df8c 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -1455,9 +1455,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX10-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
 ; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
+; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
 ; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %valptr
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index c1abc3002a990..97bcd8b5ee68a 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1168,8 +1168,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
 ; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
 ; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1502,8 +1502,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_cmp_ne_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index ffe0596a95e33..e45dd57554675 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -12225,9 +12225,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
-; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    s_endpgm
@@ -12278,9 +12278,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GCN2-NEXT:    v_mov_b32_e32 v5, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
-; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
 ; GCN2-NEXT:    s_endpgm
@@ -12317,9 +12317,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -12376,9 +12376,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
-; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    s_endpgm
@@ -12429,9 +12429,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GCN2-NEXT:    v_mov_b32_e32 v5, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
-; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
 ; GCN2-NEXT:    s_endpgm
@@ -12468,9 +12468,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -12680,9 +12680,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GCN1-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s11
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
-; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT:    v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[16:19], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[16:19], 0 offen
 ; GCN1-NEXT:    s_endpgm
@@ -12735,9 +12735,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GCN2-NEXT:    v_mov_b32_e32 v5, s10
 ; GCN2-NEXT:    v_mov_b32_e32 v4, s11
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
-; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT:    v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
 ; GCN2-NEXT:    s_endpgm
@@ -12775,9 +12775,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -12998,9 +12998,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GCN1-NEXT:    buffer_load_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
-; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[12:15], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[12:15], 0 offen
 ; GCN1-NEXT:    s_endpgm
@@ -13049,9 +13049,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GCN2-NEXT:    v_mov_b32_e32 v5, s2
 ; GCN2-NEXT:    v_mov_b32_e32 v4, s3
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
-; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT:    v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1]
+; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
 ; GCN2-NEXT:    s_endpgm
@@ -13087,9 +13087,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT:    s_endpgm
 entry:
@@ -13290,9 +13290,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GCN1-NEXT:    buffer_load_dword v1, v3, s[16:19], 0 offen
 ; GCN1-NEXT:    v_mov_b32_e32 v4, s11
 ; GCN1-NEXT:    s_waitcnt vmcnt(0)
-; GCN1-NEXT:    v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN1-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN1-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN1-NEXT:    v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN1-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN1-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN1-NEXT:    buffer_store_dword v0, v2, s[16:19], 0 offen
 ; GCN1-NEXT:    buffer_store_dword v1, v3, s[16:19], 0 offen
 ; GCN1-NEXT:    s_endpgm
@@ -13343,9 +13343,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GCN2-NEXT:    v_mov_b32_e32 v5, s10
 ; GCN2-NEXT:    v_mov_b32_e32 v4, s11
 ; GCN2-NEXT:    s_waitcnt vmcnt(0)
-; GCN2-NEXT:    v_cmp_eq_u64_e32 vcc, s[14:15], v[0:1]
-; GCN2-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN2-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN2-NEXT:    v_cmp_ne_u64_e32 vcc, s[14:15], v[0:1]
+; GCN2-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GCN2-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
 ; GCN2-NEXT:    buffer_store_dword v0, v2, s[88:91], 0 offen
 ; GCN2-NEXT:    buffer_store_dword v1, v3, s[88:91], 0 offen
 ; GCN2-NEXT:    s_endpgm
@@ -13382,9 +13382,9 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
 ; GFX12-NEXT:    s_cselect_b32 s0, s0, -1
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], off, s0
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[6:7], v[0:1]
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX12-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
+; GFX12-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[6:7], v[0:1]
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, s3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, s2, v0, vcc_lo
 ; GFX12-NEXT:    scratch_store_b64 off, v[0:1], s0
 ; GFX12-NEXT:    s_endpgm
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
index 2bfe5492263d3..1c298014e33e7 100644
--- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll
@@ -27,27 +27,27 @@ define amdgpu_gfx [13 x i32] @issue130120() {
 ; CHECK-NEXT:    s_mov_b32 s48, 0
 ; CHECK-NEXT:  .LBB0_1: ; %bb3
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_cmp_lg_u32 s46, 0
+; CHECK-NEXT:    s_cmp_eq_u32 s46, 0
 ; CHECK-NEXT:    s_mov_b32 s49, s48
 ; CHECK-NEXT:    s_mov_b32 s50, s48
-; CHECK-NEXT:    s_cselect_b32 s51, s1, 0
-; CHECK-NEXT:    s_cselect_b32 s55, s35, 0
+; CHECK-NEXT:    s_cselect_b32 s51, 0, s1
+; CHECK-NEXT:    s_cselect_b32 s55, 0, s35
 ; CHECK-NEXT:    v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
-; CHECK-NEXT:    s_cselect_b32 s52, s2, 0
-; CHECK-NEXT:    s_cselect_b32 s56, s36, 0
-; CHECK-NEXT:    s_cselect_b32 vcc_lo, s43, 0
+; CHECK-NEXT:    s_cselect_b32 s52, 0, s2
+; CHECK-NEXT:    s_cselect_b32 s56, 0, s36
+; CHECK-NEXT:    s_cselect_b32 vcc_lo, 0, s43
 ; CHECK-NEXT:    v_mov_b32_e32 v4, s50
-; CHECK-NEXT:    s_cselect_b32 s47, 0xf0, s45
-; CHECK-NEXT:    s_cselect_b32 s53, s3, 0
-; CHECK-NEXT:    s_cselect_b32 s54, s34, 0
-; CHECK-NEXT:    s_cselect_b32 s57, s37, 0
-; CHECK-NEXT:    s_cselect_b32 s58, s38, 0
-; CHECK-NEXT:    s_cselect_b32 s59, s0, 0
-; CHECK-NEXT:    s_cselect_b32 s60, s39, 0
-; CHECK-NEXT:    s_cselect_b32 s61, s40, 0
-; CHECK-NEXT:    s_cselect_b32 s62, s41, 0
-; CHECK-NEXT:    s_cselect_b32 s63, s42, 0
-; CHECK-NEXT:    s_cselect_b32 vcc_hi, s44, 0
+; CHECK-NEXT:    s_cselect_b32 s47, s45, 0xf0
+; CHECK-NEXT:    s_cselect_b32 s53, 0, s3
+; CHECK-NEXT:    s_cselect_b32 s54, 0, s34
+; CHECK-NEXT:    s_cselect_b32 s57, 0, s37
+; CHECK-NEXT:    s_cselect_b32 s58, 0, s38
+; CHECK-NEXT:    s_cselect_b32 s59, 0, s0
+; CHECK-NEXT:    s_cselect_b32 s60, 0, s39
+; CHECK-NEXT:    s_cselect_b32 s61, 0, s40
+; CHECK-NEXT:    s_cselect_b32 s62, 0, s41
+; CHECK-NEXT:    s_cselect_b32 s63, 0, s42
+; CHECK-NEXT:    s_cselect_b32 vcc_hi, 0, s44
 ; CHECK-NEXT:    s_mov_b32 s46, s48
 ; CHECK-NEXT:    scratch_store_b32 off, v0, s51
 ; CHECK-NEXT:    scratch_store_b32 off, v0, s52
diff --git a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
index 64948c374e4dd..2bbfa8f7a47ed 100644
--- a/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
+++ b/llvm/test/CodeGen/AMDGPU/known-never-snan.ll
@@ -228,8 +228,8 @@ define float @v_test_known_not_snan_select_input_fmed3_r_i_i_f32(float %a, float
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %a.nnan.add = fdiv nnan float 1.0, %a, !fpmath !0
@@ -264,8 +264,8 @@ define float @v_select_possible_nan_rhs_input_fmed3_r_i_i_f32(float %a, float %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_med3_f32 v0, v0, 2.0, 4.0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index 71e41659d41dd..2051e8011d296 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -206,10 +206,10 @@ define amdgpu_cs void @test_mixed(i32 %a, i32 %p, i32 %q, i32 %r, i32 %s, ptr ad
 define amdgpu_cs void @test_sgpr(i32 %a, i32 %p, i32 inreg %q, i32 inreg %r, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_sgpr:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ne_u32_e32 vcc_lo, -1, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, s0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, s1, vcc_lo
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc_lo, -1, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v5, s0, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v6, s1, 0, vcc_lo
 ; GCN-NEXT:    global_store_b96 v[2:3], v[4:6], off
 ; GCN-NEXT:    s_endpgm
 .entry:

>From b26f8a4e25ada8c00fd1d90fda40c5a6404252ae Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 16:18:15 +0200
Subject: [PATCH 4/5] add fcmp case

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  79 +++++++------
 .../AMDGPU/GlobalISel/select-to-fmin-fmax.ll  | 109 +++++++++--------
 .../AMDGPU/copysign-simplify-demanded-bits.ll |   4 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll             |  24 ++--
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  16 +--
 llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll         |  74 ++++++------
 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll    | 110 +++++++-----------
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |  30 ++---
 8 files changed, 220 insertions(+), 226 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index c3f9533d36323..ddccd239dc41e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -313,6 +313,7 @@ class AMDGPUCodeGenPrepareImpl
                       FastMathFlags FMF) const;
   Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                           FastMathFlags FMF) const;
+  bool swapSelectOperands(CmpInst &I);
 
 public:
   bool visitFDiv(BinaryOperator &I);
@@ -321,6 +322,7 @@ class AMDGPUCodeGenPrepareImpl
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoadInst(LoadInst &I);
   bool visitICmpInst(ICmpInst &I);
+  bool visitFCmpInst(FCmpInst &I);
   bool visitSelectInst(SelectInst &I);
   bool visitPHINode(PHINode &I);
   bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
@@ -891,6 +893,44 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
   return Builder.CreateFMul(Rsq, OutputScaleFactor);
 }
 
+// Check whether the select users' operands should be swapped so that the
+// resulting v_cndmask can later be shrunk into the VOP2 encoding.
+bool AMDGPUCodeGenPrepareImpl::swapSelectOperands(CmpInst &I) {
+  int ShouldSwap = 0;
+  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
+    auto User = Use->getUser();
+
+    if (!isa<SelectInst>(User))
+      return false;
+
+    auto SelectI = dyn_cast<SelectInst>(User);
+
+    auto Op1 = SelectI->getOperand(1);
+    auto Op2 = SelectI->getOperand(2);
+
+    if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
+      ShouldSwap++;
+    else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
+      ShouldSwap--;
+  }
+
+  if (ShouldSwap <= 0)
+    return false;
+
+  // Swapping the select operands requires inverting the comparison predicate.
+  I.setPredicate(I.getInversePredicate());
+
+  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
+    auto SelectI = dyn_cast<Instruction>(Use->getUser());
+
+    auto Op = SelectI->getOperand(1);
+
+    SelectI->setOperand(1, SelectI->getOperand(2));
+    SelectI->setOperand(2, Op);
+  }
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
                                                   FastMathFlags DivFMF,
                                                   FastMathFlags SqrtFMF) const {
@@ -1768,6 +1808,10 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
   return false;
 }
 
+bool AMDGPUCodeGenPrepareImpl::visitFCmpInst(FCmpInst &I) {
+  return swapSelectOperands(I);
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
 
@@ -1775,40 +1819,7 @@ bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
       UA.isUniform(&I))
     Changed |= promoteUniformOpToI32(I);
 
-  // check if select operands should be swapped
-  // so that v_cndmask can be later shrinked into
-  // vop2
-  int ShouldSwap = 0;
-  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
-    auto User = Use->getUser();
-
-    if (!isa<SelectInst>(User))
-      return Changed;
-
-    auto SelectI = dyn_cast<SelectInst>(User);
-
-    auto Op1 = SelectI->getOperand(1);
-    auto Op2 = SelectI->getOperand(2);
-
-    if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
-      ShouldSwap++;
-    else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
-      ShouldSwap--;
-  }
-
-  if (ShouldSwap <= 0)
-    return Changed;
-
-  I.setPredicate(I.getInverseCmpPredicate());
-
-  for (auto Use = I.use_begin(); Use != I.use_end(); Use++) {
-    auto SelectI = dyn_cast<Instruction>(Use->getUser());
-
-    auto Op = SelectI->getOperand(1);
-
-    SelectI->setOperand(1, SelectI->getOperand(2));
-    SelectI->setOperand(2, Op);
-  }
+  Changed |= swapSelectOperands(I);
 
   return Changed;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
index ee3bf96111994..b52f50a09589f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
@@ -5,8 +5,8 @@ define half @test_s16(half %a) #0 {
 ; GCN-LABEL: test_s16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt half %a, 0.0
@@ -18,8 +18,8 @@ define float @test_s32(float %a) #0 {
 ; GCN-LABEL: test_s32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt float %a, 0.0
@@ -31,9 +31,9 @@ define double @test_s64(double %a) #0 {
 ; GCN-LABEL: test_s64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_cmp_ngt_f64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt double %a, 0.0
@@ -45,20 +45,19 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 {
 ; GCN-LABEL: test_v4s16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
-; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v0, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, v4 src0_sel:WORD_1 src1_sel:DWORD
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
-; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, v4 src0_sel:WORD_1 src1_sel:DWORD
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v5
-; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_ngt_f16_e64 s[6:7], 0, v0
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa s[4:5], v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v0, s[6:7]
+; GCN-NEXT:    v_cndmask_b32_sdwa v0, v2, v0, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
+; GCN-NEXT:    s_mov_b64 vcc, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v4
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -72,29 +71,29 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v9, v0, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, v8 src0_sel:WORD_1 src1_sel:DWORD
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa vcc, v0, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, 0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, v8 src0_sel:WORD_1 src1_sel:DWORD
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa vcc, v1, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, 0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v2, v8 src0_sel:WORD_1 src1_sel:DWORD
-; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v5, 0, v2, vcc
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa vcc, v2, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GCN-NEXT:    v_cmp_ngt_f16_e32 vcc, 0, v3
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v6, v3, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e32 v6, 0, v3, vcc
+; GCN-NEXT:    v_cmp_nlt_f16_sdwa vcc, v3, v8 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v7, 0, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v9
 ; GCN-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
@@ -111,10 +110,10 @@ define <2 x float> @test_v2s32(<2 x float> %a) #0 {
 ; GCN-LABEL: test_v2s32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <2 x float> %a, zeroinitializer
@@ -126,14 +125,14 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 {
 ; GCN-LABEL: test_v4s32:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v2
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc, 0, v3
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v2
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <4 x float> %a, zeroinitializer
@@ -145,12 +144,12 @@ define <2 x double> @test_v2s64(<2 x double> %a) #0 {
 ; GCN-LABEL: test_v2s64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cmp_gt_f64_e64 s[4:5], 0, v[2:3]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[4:5]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, 0, s[4:5]
+; GCN-NEXT:    v_cmp_ngt_f64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_cmp_ngt_f64_e64 s[4:5], 0, v[2:3]
+; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
+; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %fcmp = fcmp olt <2 x double> %a, zeroinitializer
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..3d6bf45eaf48d 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -218,8 +218,8 @@ define float @copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select(fl
 ; GFX9-LABEL: copysign_f32_f32_sign_known_p0_or_n0__mag_known_positive_select:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX9-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX9-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 31, v1
 ; GFX9-NEXT:    s_brev_b32 s4, -2
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index cbb07672be8ec..22b6225eba76d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -1032,10 +1032,10 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, 1.0, v3
-; VI-GISEL-NEXT:    v_cmp_nlt_f32_e32 vcc, 2.0, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 2.0, vcc
-; VI-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc, 4.0, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 4.0, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 2.0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 4.0, v2, vcc
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    s_endpgm
 ;
@@ -1059,10 +1059,10 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX9-GISEL-NEXT:    v_cmp_nlt_f32_e32 vcc, 2.0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 2.0, vcc
-; GFX9-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc, 4.0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GFX9-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, 2.0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
+; GFX9-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, 4.0, v1
+; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -1090,11 +1090,11 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX11-GISEL-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 2.0, v1
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 2.0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
+; GFX11-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 4.0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc_lo
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index d957ba93e4fb3..4ea13c460a590 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -2103,16 +2103,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX6-NEXT:    v_min_f32_e32 v7, 0x3f7fffff, v7
 ; GFX6-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v8, 0x204
+; GFX6-NEXT:    v_mov_b32_e32 v8, 0x1fb
 ; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
 ; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v0, v8
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v7, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
 ; GFX6-NEXT:    v_cmp_class_f32_e32 vcc, v1, v8
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v6, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
@@ -2778,19 +2778,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX6-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
 ; GFX6-NEXT:    v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v14, 0x204
+; GFX6-NEXT:    v_mov_b32_e32 v14, 0x1fb
 ; GFX6-NEXT:    v_cndmask_b32_e32 v13, v13, v1, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v12, v12, v0, vcc
 ; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[0:1], v14
 ; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    v_cndmask_b32_e64 v0, v12, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v1, v13, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v12, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v13, vcc
 ; GFX6-NEXT:    v_cmp_class_f64_e32 vcc, v[2:3], v14
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s4, s6
 ; GFX6-NEXT:    s_mov_b32 s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, v10, 0, vcc
-; GFX6-NEXT:    v_cndmask_b32_e64 v3, v11, 0, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v11, vcc
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 87c7cce854b11..77b2891921ecd 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -3783,21 +3783,25 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SDAG-IEEE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SDAG-IEEE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SDAG-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SDAG-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SDAG-IEEE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SDAG-IEEE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v2, v1
+; SDAG-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; SDAG-IEEE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; SDAG-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; SDAG-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; SDAG-IEEE-NEXT:    v_fma_f32 v2, -v4, v2, v1
+; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v3, 0x260
+; SDAG-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v1, v3
+; SDAG-IEEE-NEXT:    s_brev_b32 s4, 1
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; SDAG-IEEE-NEXT:    v_cmp_le_f32_e32 vcc, s4, v0
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-IEEE-LABEL: v_elim_redun_check_ult_sqrt:
@@ -3823,8 +3827,8 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GISEL-IEEE-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-DAZ-LABEL: v_elim_redun_check_ult_sqrt:
@@ -3833,20 +3837,24 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
 ; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
 ; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, v0, v1
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v3, v2
-; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v1, v3, v1
-; SDAG-DAZ-NEXT:    v_fma_f32 v1, v4, v1, v2
-; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x260
-; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SDAG-DAZ-NEXT:    v_rsq_f32_e32 v2, v1
+; SDAG-DAZ-NEXT:    s_brev_b32 s4, 1
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v3, v1, v2
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; SDAG-DAZ-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
+; SDAG-DAZ-NEXT:    v_fma_f32 v3, v3, v4, v3
+; SDAG-DAZ-NEXT:    v_fma_f32 v5, -v3, v3, v1
+; SDAG-DAZ-NEXT:    v_fma_f32 v2, v2, v4, v2
+; SDAG-DAZ-NEXT:    v_fma_f32 v2, v5, v2, v3
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SDAG-DAZ-NEXT:    v_mov_b32_e32 v3, 0x260
+; SDAG-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v1, v3
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; SDAG-DAZ-NEXT:    v_cmp_le_f32_e32 vcc, s4, v0
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-DAZ-LABEL: v_elim_redun_check_ult_sqrt:
@@ -3871,8 +3879,8 @@ define float @v_elim_redun_check_ult_sqrt(float %in) {
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GISEL-DAZ-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GISEL-DAZ-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-DAZ-NEXT:    v_cmp_ge_f32_e32 vcc, v0, v2
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp ult float %in, -0.000000e+00
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index 2051e8011d296..bedd4ff60ac6f 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -422,9 +422,8 @@ define amdgpu_cs void @test_u64_ult(i64 %a, i64 %p, i64 %q, ptr addrspace(1) %ou
 define amdgpu_cs void @test_f32_oeq(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_oeq:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -440,9 +439,9 @@ define amdgpu_cs void @test_f32_oeq(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_negative_modifiers(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_negative_modifiers:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, -v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v1, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -v2, vcc_lo
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -460,9 +459,8 @@ define amdgpu_cs void @test_f32_negative_modifiers(float %a, float %p, float %q,
 define amdgpu_cs void @test_f32_one(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_one:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lg_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nlg_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -478,9 +476,8 @@ define amdgpu_cs void @test_f32_one(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_ord(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_ord:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -496,9 +493,8 @@ define amdgpu_cs void @test_f32_ord(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_uno(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_uno:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_o_f32_e32 vcc_lo, v0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -514,9 +510,8 @@ define amdgpu_cs void @test_f32_uno(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_oge(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_oge:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nge_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -532,9 +527,8 @@ define amdgpu_cs void @test_f32_oge(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_ole(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_ole:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_le_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -550,9 +544,8 @@ define amdgpu_cs void @test_f32_ole(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_ogt(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_ogt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -568,9 +561,8 @@ define amdgpu_cs void @test_f32_ogt(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_olt(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_olt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_dual_cndmask_b32 v0, 0, v1 :: v_dual_cndmask_b32 v1, 0, v2
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -587,11 +579,9 @@ define amdgpu_cs void @test_f32_olt(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f64_oeq(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_oeq:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_neq_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -607,11 +597,9 @@ define amdgpu_cs void @test_f64_oeq(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_one(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_one:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lg_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -627,11 +615,9 @@ define amdgpu_cs void @test_f64_one(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_oge(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_oge:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_ge_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nge_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -647,11 +633,9 @@ define amdgpu_cs void @test_f64_oge(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_ole(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_ole:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_le_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nle_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -667,11 +651,9 @@ define amdgpu_cs void @test_f64_ole(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_ogt(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_ogt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_gt_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_ngt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -687,11 +669,9 @@ define amdgpu_cs void @test_f64_ogt(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_olt(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_olt:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_lt_f64_e32 vcc_lo, 2.0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_nlt_f64_e32 vcc_lo, 2.0, v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -707,11 +687,9 @@ define amdgpu_cs void @test_f64_olt(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_ord(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_ord:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
@@ -727,11 +705,9 @@ define amdgpu_cs void @test_f64_ord(double %a, double %p, double %q, ptr addrspa
 define amdgpu_cs void @test_f64_uno(double %a, double %p, double %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f64_uno:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, 0, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GCN-NEXT:    v_cmp_o_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GCN-NEXT:    v_dual_cndmask_b32 v1, 0, v3 :: v_dual_cndmask_b32 v0, 0, v2
+; GCN-NEXT:    v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4
 ; GCN-NEXT:    global_store_b128 v[6:7], v[0:3], off
 ; GCN-NEXT:    s_endpgm
 .entry:
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index a41063f467d01..1ef4164a93b9f 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -2043,9 +2043,9 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v2
-; SI-NEXT:    v_cndmask_b32_e64 v2, v3, -1.0, vcc
-; SI-NEXT:    v_cndmask_b32_e64 v3, v3, -2.0, vcc
+; SI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v2
+; SI-NEXT:    v_cndmask_b32_e32 v2, -1.0, v3, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v3, -2.0, v3, vcc
 ; SI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
@@ -2071,9 +2071,9 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v5
-; VI-NEXT:    v_cndmask_b32_e64 v3, v2, -1.0, vcc
-; VI-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
+; VI-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v5
+; VI-NEXT:    v_cndmask_b32_e32 v3, -1.0, v2, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, -2.0, v2, vcc
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -2091,9 +2091,9 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_load_dword v2, v0, s[6:7] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
+; GFX10-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, -1.0, v2, vcc
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, -2.0, v2, vcc
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
@@ -2113,9 +2113,9 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v2, v0, s[4:5] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
+; GFX11-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, -1.0, v2, vcc
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, -2.0, v2, vcc
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
@@ -2135,9 +2135,9 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_cmp_nle_f32_e32 vcc, 4.0, v1
-; GFX12-NEXT:    v_cndmask_b32_e64 v1, v2, -1.0, vcc
-; GFX12-NEXT:    v_cndmask_b32_e64 v2, v2, -2.0, vcc
+; GFX12-NEXT:    v_cmp_le_f32_e32 vcc, 4.0, v1
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, -1.0, v2, vcc
+; GFX12-NEXT:    v_cndmask_b32_e32 v2, -2.0, v2, vcc
 ; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1] scope:SCOPE_SYS

>From 9c3244bc61790346d41f16ea777c0dc7649108f4 Mon Sep 17 00:00:00 2001
From: Ana Mihajlovic <Ana.Mihajlovic at amd.com>
Date: Fri, 30 May 2025 17:00:32 +0200
Subject: [PATCH 5/5] added fneg/fabs operand check

---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |  23 +++
 llvm/test/CodeGen/AMDGPU/fract-match.ll       | 175 ++---------------
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  | 180 +++++++++---------
 llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll    |   6 +-
 4 files changed, 134 insertions(+), 250 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index ddccd239dc41e..72b30d997d48d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -314,6 +314,7 @@ class AMDGPUCodeGenPrepareImpl
   Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                           FastMathFlags FMF) const;
   bool swapSelectOperands(CmpInst &I);
+  bool isFnegOrFabs(Value &V);
 
 public:
   bool visitFDiv(BinaryOperator &I);
@@ -893,6 +894,23 @@ static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
   return Builder.CreateFMul(Rsq, OutputScaleFactor);
 }
 
+bool AMDGPUCodeGenPrepareImpl::isFnegOrFabs(Value &V) {
+  // Returns true if V is an fneg instruction or a call to llvm.fabs.
+  Instruction *I = dyn_cast<Instruction>(&V);
+  if (!I)
+    return false;
+
+  if (I->getOpcode() == Instruction::FNeg)
+    return true;
+
+  // getCalledFunction() is null for indirect calls; guard before deref.
+  auto *CallI = dyn_cast<CallInst>(I);
+  if (!CallI)
+    return false;
+  auto *CallF = CallI->getCalledFunction();
+  return CallF && CallF->isIntrinsic() && CallF->getIntrinsicID() == Intrinsic::fabs;
+}
+
 // check if select operands should be swapped
 // so that v_cndmask can be later shrinked into vop2
 bool AMDGPUCodeGenPrepareImpl::swapSelectOperands(CmpInst &I) {
@@ -908,6 +926,11 @@ bool AMDGPUCodeGenPrepareImpl::swapSelectOperands(CmpInst &I) {
     auto Op1 = SelectI->getOperand(1);
     auto Op2 = SelectI->getOperand(2);
 
+    // if the operand is defined by fneg or fabs it means the instruction
+    // will have source modifiers and therefore can't be shrunk to vop2
+    if (isFnegOrFabs(*Op1) || isFnegOrFabs(*Op2))
+      continue;
+
     if (!UA.isDivergent(Op1) && UA.isDivergent(Op2))
       ShouldSwap++;
     else if (UA.isDivergent(Op1) && !UA.isDivergent(Op2))
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 4ea13c460a590..b4dd441097b78 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -33,8 +33,8 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno float [[X]], 0.000000e+00
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select i1 [[UNO]], float [[X]], float [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une float [[FABS]], 0x7FF0000000000000
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], float [[COND]], float 0.000000e+00
 ; GFX6-IR-NEXT:    store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret float [[COND6]]
 ;
@@ -67,7 +67,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -83,7 +82,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -95,7 +93,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -106,7 +103,6 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures(
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -169,7 +165,6 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) writeon
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -182,7 +177,6 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) writeon
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -191,7 +185,6 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) writeon
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -199,7 +192,6 @@ define float @safe_math_fract_f32_noinf_check(float %x, ptr addrspace(1) writeon
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_f32_noinf_check:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -230,8 +222,8 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c
 ; IR-NEXT:    [[SUB:%.*]] = fsub float [[X]], [[FLOOR]]
 ; IR-NEXT:    [[MIN:%.*]] = tail call float @llvm.minnum.f32(float [[SUB]], float 0x3FEFFFFFE0000000)
 ; IR-NEXT:    [[FABS:%.*]] = tail call float @llvm.fabs.f32(float [[X]])
-; IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq float [[FABS]], 0x7FF0000000000000
-; IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], float 0.000000e+00, float [[MIN]]
+; IR-NEXT:    [[CMPINF:%.*]] = fcmp une float [[FABS]], 0x7FF0000000000000
+; IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], float [[MIN]], float 0.000000e+00
 ; IR-NEXT:    store float [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; IR-NEXT:    ret float [[COND6]]
 ;
@@ -251,7 +243,6 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: no_nan_check_math_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -268,7 +259,6 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: no_nan_check_math_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -281,7 +271,6 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: no_nan_check_math_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -293,7 +282,6 @@ define float @no_nan_check_math_fract_f32(float %x, ptr addrspace(1) writeonly c
 ; GFX11-NEXT:    v_min_f32_e32 v4, 0x3f7fffff, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: no_nan_check_math_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -343,25 +331,21 @@ define float @basic_fract_f32_nonans(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f32_nonans:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f32_nonans:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_f32_nonans:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_f32_nonans:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -394,7 +378,6 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f32_flags_minnum:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -402,7 +385,6 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f32_flags_minnum:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -410,7 +392,6 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_f32_flags_minnum:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -419,7 +400,6 @@ define float @basic_fract_f32_flags_minnum(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_f32_flags_minnum:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -461,25 +441,21 @@ define float @basic_fract_f32_flags_fsub(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f32_flags_fsub:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f32_flags_fsub:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_f32_flags_fsub:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_f32_flags_fsub:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -526,28 +502,24 @@ define <2 x float> @basic_fract_v2f32_nonans(<2 x float> nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fffff, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_v2f32_nonans:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    v_fract_f32_e32 v1, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_v2f32_nonans:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    v_fract_f32_e32 v1, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_v2f32_nonans:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    v_fract_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_v2f32_nonans:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -597,7 +569,6 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX6-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -611,7 +582,6 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX7-NEXT:    buffer_store_dword v3, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -621,7 +591,6 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -631,7 +600,6 @@ define float @basic_fract_f32_multi_use_fsub_nonans(float nofpclass(nan) %x, ptr
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_f32_multi_use_fsub_nonans:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -675,25 +643,21 @@ define float @nnan_minnum_fract_f32(float %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: nnan_minnum_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: nnan_minnum_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: nnan_minnum_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: nnan_minnum_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -728,7 +692,6 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: nnan_fsub_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -736,7 +699,6 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: nnan_fsub_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -744,7 +706,6 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: nnan_fsub_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -753,7 +714,6 @@ define float @nnan_fsub_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: nnan_fsub_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -789,7 +749,6 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: nnan_floor_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -797,7 +756,6 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: nnan_floor_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -805,7 +763,6 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: nnan_floor_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -814,7 +771,6 @@ define float @nnan_floor_fract_f32(float %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: nnan_floor_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -856,25 +812,21 @@ define float @nnan_src_fract_f32(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: nnan_src_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: nnan_src_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: nnan_src_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: nnan_src_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -908,7 +860,6 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: not_fract_f32_wrong_const:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -916,7 +867,6 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: not_fract_f32_wrong_const:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -924,7 +874,6 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: not_fract_f32_wrong_const:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -933,7 +882,6 @@ define float @not_fract_f32_wrong_const(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7ffffe, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: not_fract_f32_wrong_const:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -970,7 +918,6 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: not_fract_f32_swapped_fsub:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -978,7 +925,6 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: not_fract_f32_swapped_fsub:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -986,7 +932,6 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: not_fract_f32_swapped_fsub:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -995,7 +940,6 @@ define float @not_fract_f32_swapped_fsub(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: not_fract_f32_swapped_fsub:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1032,7 +976,6 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: not_fract_f32_not_floor:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1040,7 +983,6 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: not_fract_f32_not_floor:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1048,7 +990,6 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: not_fract_f32_not_floor:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1057,7 +998,6 @@ define float @not_fract_f32_not_floor(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: not_fract_f32_not_floor:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1094,7 +1034,6 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: not_fract_f32_different_floor:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1102,7 +1041,6 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: not_fract_f32_different_floor:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1110,7 +1048,6 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: not_fract_f32_different_floor:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1119,7 +1056,6 @@ define float @not_fract_f32_different_floor(float %x, float %y) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_min_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: not_fract_f32_different_floor:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1156,7 +1092,6 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: not_fract_f32_maxnum:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1164,7 +1099,6 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: not_fract_f32_maxnum:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1172,7 +1106,6 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: not_fract_f32_maxnum:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1181,7 +1114,6 @@ define float @not_fract_f32_maxnum(float nofpclass(nan) %x) {
 ; GFX11-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_max_f32_e32 v0, 0x3f7fffff, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: not_fract_f32_maxnum:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1216,7 +1148,6 @@ define float @fcmp_uno_check_is_nan_f32(float %x) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: fcmp_uno_check_is_nan_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1261,25 +1192,21 @@ define float @select_nan_fract_f32(float %x) {
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: select_nan_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: select_nan_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: select_nan_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: select_nan_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1324,25 +1251,21 @@ define float @commuted_select_nan_fract_f32(float %x) {
 ; GFX6-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: commuted_select_nan_fract_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: commuted_select_nan_fract_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: commuted_select_nan_fract_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: commuted_select_nan_fract_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1381,7 +1304,6 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: wrong_commuted_nan_select_f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1391,7 +1313,6 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX7-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: wrong_commuted_nan_select_f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1401,7 +1322,6 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: wrong_commuted_nan_select_f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1413,7 +1333,6 @@ define float @wrong_commuted_nan_select_f32(float %x) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: wrong_commuted_nan_select_f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1471,7 +1390,6 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f16_nonan:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1481,25 +1399,21 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-TRUE16-LABEL: basic_fract_f16_nonan:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-FAKE16-LABEL: basic_fract_f16_nonan:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-TRUE16-LABEL: basic_fract_f16_nonan:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1509,7 +1423,6 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-FAKE16-LABEL: basic_fract_f16_nonan:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1568,7 +1481,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX6-NEXT:    v_min_f32_e32 v1, 0x3f7fe000, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_v2f16_nonan:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1583,7 +1495,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX7-NEXT:    v_min_f32_e32 v0, 0x3f7fe000, v0
 ; GFX7-NEXT:    v_min_f32_e32 v1, 0x3f7fe000, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_v2f16_nonan:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1591,7 +1502,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1600,7 +1510,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-FAKE16-LABEL: basic_fract_v2f16_nonan:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1610,7 +1519,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v1, v1
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-TRUE16-LABEL: basic_fract_v2f16_nonan:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1623,7 +1531,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-FAKE16-LABEL: basic_fract_v2f16_nonan:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1675,25 +1582,21 @@ define double @basic_fract_f64_nanans(double nofpclass(nan) %x) {
 ; GFX6-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
 ; GFX6-NEXT:    v_min_f64 v[0:1], v[0:1], s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: basic_fract_f64_nanans:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: basic_fract_f64_nanans:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: basic_fract_f64_nanans:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: basic_fract_f64_nanans:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1759,7 +1662,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX6-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1778,7 +1680,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX7-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1787,7 +1688,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX8-NEXT:    global_store_short v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-TRUE16-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1795,7 +1695,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-FAKE16-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1803,7 +1702,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX11-FAKE16-NEXT:    v_fract_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[1:2], v3, off
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-TRUE16-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1815,7 +1713,6 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
 ; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    global_store_d16_hi_b16 v[1:2], v0, off
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-FAKE16-LABEL: safe_math_fract_f16_noinf_check:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1882,7 +1779,6 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) write
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1895,7 +1791,6 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) write
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1904,7 +1799,6 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) write
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1912,7 +1806,6 @@ define double @safe_math_fract_f64_noinf_check(double %x, ptr addrspace(1) write
 ; GFX11-NEXT:    v_fract_f64_e32 v[0:1], v[0:1]
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_f64_noinf_check:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1960,25 +1853,21 @@ define float @select_nan_fract_f32_flags_select(float %x) {
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: select_nan_fract_f32_flags_select:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: select_nan_fract_f32_flags_select:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: select_nan_fract_f32_flags_select:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: select_nan_fract_f32_flags_select:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2023,25 +1912,21 @@ define float @select_nan_fract_f32_flags_minnum(float %x) {
 ; GFX6-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f32_e32 v0, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: select_nan_fract_f32_flags_minnum:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2070,8 +1955,8 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno <2 x float> [[X]], zeroinitializer
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select <2 x i1> [[UNO]], <2 x float> [[X]], <2 x float> [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq <2 x float> [[FABS]], splat (float 0x7FF0000000000000)
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x float> zeroinitializer, <2 x float> [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une <2 x float> [[FABS]], splat (float 0x7FF0000000000000)
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x float> [[COND]], <2 x float> zeroinitializer
 ; GFX6-IR-NEXT:    store <2 x float> [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret <2 x float> [[COND6]]
 ;
@@ -2116,7 +2001,6 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_v2f32:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2136,7 +2020,6 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX7-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_v2f32:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2152,7 +2035,6 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_v2f32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2167,7 +2049,6 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) write
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[4:5], off
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, v7, 0, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_v2f32:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2210,8 +2091,8 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno double [[X]], 0.000000e+00
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select i1 [[UNO]], double [[X]], double [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call double @llvm.fabs.f64(double [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq double [[FABS]], 0x7FF0000000000000
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], double 0.000000e+00, double [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une double [[FABS]], 0x7FF0000000000000
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], double [[COND]], double 0.000000e+00
 ; GFX6-IR-NEXT:    store double [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret double [[COND6]]
 ;
@@ -2256,7 +2137,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
 ; GFX6-NEXT:    buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2274,7 +2154,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
 ; GFX7-NEXT:    buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f64:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2288,7 +2167,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
 ; GFX8-NEXT:    global_store_dwordx2 v[2:3], v[6:7], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_f64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2299,7 +2177,6 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture
 ; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[6:7], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_f64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2337,8 +2214,8 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno half [[X]], 0xH0000
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select i1 [[UNO]], half [[X]], half [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call half @llvm.fabs.f16(half [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq half [[FABS]], 0xH7C00
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], half 0xH0000, half [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une half [[FABS]], 0xH7C00
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], half [[COND]], half 0xH0000
 ; GFX6-IR-NEXT:    store half [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret half [[COND6]]
 ;
@@ -2351,8 +2228,8 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX7-IR-NEXT:    [[UNO:%.*]] = fcmp uno half [[X]], 0xH0000
 ; GFX7-IR-NEXT:    [[COND:%.*]] = select i1 [[UNO]], half [[X]], half [[MIN]]
 ; GFX7-IR-NEXT:    [[FABS:%.*]] = tail call half @llvm.fabs.f16(half [[X]])
-; GFX7-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq half [[FABS]], 0xH7C00
-; GFX7-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], half 0xH0000, half [[COND]]
+; GFX7-IR-NEXT:    [[CMPINF:%.*]] = fcmp une half [[FABS]], 0xH7C00
+; GFX7-IR-NEXT:    [[COND6:%.*]] = select i1 [[CMPINF]], half [[COND]], half 0xH0000
 ; GFX7-IR-NEXT:    store half [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX7-IR-NEXT:    ret half [[COND6]]
 ;
@@ -2389,7 +2266,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX6-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_f16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2412,7 +2288,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX7-NEXT:    buffer_store_short v4, v[1:2], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_f16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2424,7 +2299,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX8-NEXT:    global_store_short v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-TRUE16-LABEL: safe_math_fract_f16:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2435,7 +2309,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0, v0.h, s0
 ; GFX11-TRUE16-NEXT:    global_store_b16 v[1:2], v3, off
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-FAKE16-LABEL: safe_math_fract_f16:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2446,7 +2319,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[1:2], v4, off
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-TRUE16-LABEL: safe_math_fract_f16:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2461,7 +2333,6 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
 ; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0, v0.h, s0
 ; GFX12-TRUE16-NEXT:    global_store_b16 v[1:2], v3, off
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-FAKE16-LABEL: safe_math_fract_f16:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2499,8 +2370,8 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno <2 x half> [[X]], zeroinitializer
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select <2 x i1> [[UNO]], <2 x half> [[X]], <2 x half> [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call <2 x half> @llvm.fabs.v2f16(<2 x half> [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq <2 x half> [[FABS]], splat (half 0xH7C00)
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x half> zeroinitializer, <2 x half> [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une <2 x half> [[FABS]], splat (half 0xH7C00)
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x half> [[COND]], <2 x half> zeroinitializer
 ; GFX6-IR-NEXT:    store <2 x half> [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret <2 x half> [[COND6]]
 ;
@@ -2513,8 +2384,8 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX7-IR-NEXT:    [[UNO:%.*]] = fcmp uno <2 x half> [[X]], zeroinitializer
 ; GFX7-IR-NEXT:    [[COND:%.*]] = select <2 x i1> [[UNO]], <2 x half> [[X]], <2 x half> [[MIN]]
 ; GFX7-IR-NEXT:    [[FABS:%.*]] = tail call <2 x half> @llvm.fabs.v2f16(<2 x half> [[X]])
-; GFX7-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq <2 x half> [[FABS]], splat (half 0xH7C00)
-; GFX7-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x half> zeroinitializer, <2 x half> [[COND]]
+; GFX7-IR-NEXT:    [[CMPINF:%.*]] = fcmp une <2 x half> [[FABS]], splat (half 0xH7C00)
+; GFX7-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x half> [[COND]], <2 x half> zeroinitializer
 ; GFX7-IR-NEXT:    store <2 x half> [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX7-IR-NEXT:    ret <2 x half> [[COND6]]
 ;
@@ -2569,7 +2440,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX6-NEXT:    buffer_store_dword v7, v[2:3], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_v2f16:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2605,7 +2475,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX7-NEXT:    buffer_store_dword v7, v[2:3], s[4:7], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_v2f16:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2623,7 +2492,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-TRUE16-LABEL: safe_math_fract_v2f16:
 ; GFX11-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2641,7 +2509,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
 ; GFX11-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-FAKE16-LABEL: safe_math_fract_v2f16:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2660,7 +2527,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX11-FAKE16-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-TRUE16-LABEL: safe_math_fract_v2f16:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2682,7 +2548,6 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
 ; GFX12-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-FAKE16-LABEL: safe_math_fract_v2f16:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
 ; GFX12-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2730,8 +2595,8 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX6-IR-NEXT:    [[UNO:%.*]] = fcmp uno <2 x double> [[X]], zeroinitializer
 ; GFX6-IR-NEXT:    [[COND:%.*]] = select <2 x i1> [[UNO]], <2 x double> [[X]], <2 x double> [[MIN]]
 ; GFX6-IR-NEXT:    [[FABS:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X]])
-; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp oeq <2 x double> [[FABS]], splat (double 0x7FF0000000000000)
-; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x double> zeroinitializer, <2 x double> [[COND]]
+; GFX6-IR-NEXT:    [[CMPINF:%.*]] = fcmp une <2 x double> [[FABS]], splat (double 0x7FF0000000000000)
+; GFX6-IR-NEXT:    [[COND6:%.*]] = select <2 x i1> [[CMPINF]], <2 x double> [[COND]], <2 x double> zeroinitializer
 ; GFX6-IR-NEXT:    store <2 x double> [[FLOOR]], ptr addrspace(1) [[IP]], align 4
 ; GFX6-IR-NEXT:    ret <2 x double> [[COND6]]
 ;
@@ -2794,7 +2659,6 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX7-LABEL: safe_math_fract_v2f64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2816,7 +2680,6 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX7-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX8-LABEL: safe_math_fract_v2f64:
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2834,7 +2697,6 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX8-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-LABEL: safe_math_fract_v2f64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2850,7 +2712,6 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) wri
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, v13, 0, s1
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[6:9], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX12-LABEL: safe_math_fract_v2f64:
 ; GFX12:       ; %bb.0: ; %entry
 ; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index d8163cdebad8c..7ed27f008083e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -1015,12 +1015,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_negk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1029,12 +1029,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_negk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xbc00
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1043,12 +1043,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1071,12 +1071,12 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xbc00, v2.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1126,12 +1126,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; VI-LABEL: add_select_negliteralk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0xe400
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0xe400
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1140,12 +1140,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX9-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe400
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xe400
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1154,12 +1154,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v2.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1182,12 +1182,12 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xe400, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xe400, v2.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1346,12 +1346,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_posk_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1360,12 +1360,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_posk_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3c00
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1374,12 +1374,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -1402,12 +1402,12 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x3c00, v2.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -3836,12 +3836,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_posk_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3850,12 +3850,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4400
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -3864,12 +3864,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-SAFE-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v2.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -3892,12 +3892,12 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NSZ-TRUE16-LABEL: mul_select_posk_negfabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x4400, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x4400, v2.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4066,12 +4066,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_negk_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
+; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v2, 0xc400
-; VI-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4080,12 +4080,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc400
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -4094,12 +4094,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-SAFE-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX11-SAFE-TRUE16:       ; %bb.0:
 ; GFX11-SAFE-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-SAFE-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v0.h, s0
+; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v2.h, s0
 ; GFX11-SAFE-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-SAFE-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -4122,12 +4122,12 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NSZ-TRUE16-LABEL: mul_select_negk_negfabs_v2f16:
 ; GFX11-NSZ-TRUE16:       ; %bb.0:
 ; GFX11-NSZ-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80008000, v2
 ; GFX11-NSZ-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
-; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo
-; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v0.h, s0
+; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0xc400, v2.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0xc400, v2.h, s0
 ; GFX11-NSZ-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NSZ-TRUE16-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
index bedd4ff60ac6f..9152686541935 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-cndmask.ll
@@ -439,9 +439,9 @@ define amdgpu_cs void @test_f32_oeq(float %a, float %p, float %q, ptr addrspace(
 define amdgpu_cs void @test_f32_negative_modifiers(float %a, float %p, float %q, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_f32_negative_modifiers:
 ; GCN:       ; %bb.0: ; %.entry
-; GCN-NEXT:    v_cmp_neq_f32_e32 vcc_lo, 2.0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, -v1, vcc_lo
-; GCN-NEXT:    v_cndmask_b32_e64 v1, 0, -v2, vcc_lo
+; GCN-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 2.0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, 0, vcc_lo
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v2, 0, vcc_lo
 ; GCN-NEXT:    global_store_b64 v[3:4], v[0:1], off
 ; GCN-NEXT:    s_endpgm
 .entry:



More information about the llvm-commits mailing list