[llvm] [AMDGPU][SDAG] Support source modifiers on select integer operands (PR #147325)

Chris Jackson via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 14 07:48:17 PDT 2025


https://github.com/chrisjbris updated https://github.com/llvm/llvm-project/pull/147325

>From d4ac937b79a628dd2d962439d9b0af9b205319ac Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 2 Jul 2025 04:39:04 -0500
Subject: [PATCH 01/28] Add new test for source modifiers on select

---
 .../AMDGPU/integer-select-source-modifiers.ll | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll

diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
new file mode 100644
index 0000000000000..6e7ff16b74139
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+
+define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
+  %neg.a = xor i32 %a, u0x80000000
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %neg.a, i32 %b
+  ret i32 %select
+}
+
+define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+  %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+  ret <2 x i32> %select
+}
+
+define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
+  %neg.a = and i32 %a, u0x7fffffff
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %neg.a, i32 %b
+  ret i32 %select
+}
+
+define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+  %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+  ret <2 x i32> %select
+}
+
+define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
+  %neg.a = or i32 %a, u0x80000000
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %neg.a, i32 %b
+  ret i32 %select
+}
+
+define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+  %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
+  ret <2 x i32> %select
+}
+
+define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
+  %neg.a = xor i64 %a, u0x8000000000000000
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %neg.a, i64 %b
+  ret i64 %select
+}
+
+define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
+  %neg.a = and i64 %a, u0x7fffffffffffffff
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %neg.a, i64 %b
+  ret i64 %select
+}
+
+define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
+  %neg.a = or i64 %a, u0x8000000000000000
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %neg.a, i64 %b
+  ret i64 %select
+}

>From c89274e65b68073b59089df7e861e7fb4a7979e7 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Wed, 2 Jul 2025 04:40:36 -0500
Subject: [PATCH 02/28] Populate check-lines before patching
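
The baseline CHECK lines below were generated with the script named in the
test's NOTE line, along the lines of (invocation is illustrative):

  llvm/utils/update_llc_test_checks.py \
      llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll

so that the functional change in the next patch shows up as a pure
check-line diff.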

---
 .../AMDGPU/integer-select-source-modifiers.ll | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index 6e7ff16b74139..dd6cf9bc6c592 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -5,6 +5,22 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_select_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i32 %a, u0x80000000
   %cmp = icmp eq i32 %cond, zeroinitializer
   %select = select i1 %cmp, i32 %neg.a, i32 %b
@@ -12,6 +28,28 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_select_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
@@ -19,6 +57,22 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 }
 
 define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fabs_select_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i32 %a, u0x7fffffff
   %cmp = icmp eq i32 %cond, zeroinitializer
   %select = select i1 %cmp, i32 %neg.a, i32 %b
@@ -26,6 +80,28 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fabs_select_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
@@ -33,6 +109,22 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 }
 
 define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
+; GCN-LABEL: fneg_fabs_select_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i32 %a, u0x80000000
   %cmp = icmp eq i32 %cond, zeroinitializer
   %select = select i1 %cmp, i32 %neg.a, i32 %b
@@ -40,6 +132,28 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_fabs_select_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer
   %select = select <2 x i1> %cmp, <2 x i32> %neg.a, <2 x i32> %b
@@ -47,6 +161,23 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32
 }
 
 define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_select_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
   %select = select i1 %cmp, i64 %neg.a, i64 %b
@@ -54,6 +185,23 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fabs_select_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i64 %a, u0x7fffffffffffffff
   %cmp = icmp eq i64 %cond, zeroinitializer
   %select = select i1 %cmp, i64 %neg.a, i64 %b
@@ -61,8 +209,30 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
+; GCN-LABEL: fneg_fabs_select_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
   %select = select i1 %cmp, i64 %neg.a, i64 %b
   ret i64 %select
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11-FAKE16: {{.*}}
+; GFX11-TRUE16: {{.*}}
+; GFX7: {{.*}}
+; GFX9: {{.*}}

>From b6b37265d283ee0d6a301280e8c34a68283375ac Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 10:12:45 -0500
Subject: [PATCH 03/28] [AMDGPU][SDAG] Support source modifiers on select
 integer operands

Extend the select DAG combine to directly support fneg and fabs source
modifiers for i32, v2i32 and i64 operands.
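
The fold relies on the standard IEEE-754 sign-bit identities. As a
reference point only, here is a standalone sketch in plain C++ (not LLVM
code) checking the identities the combine exploits:

  // xor/and/or with the sign-bit mask on a float's bit pattern are
  // exactly fneg, fabs and fneg(fabs(...)), respectively.
  #include <bit>      // std::bit_cast (C++20)
  #include <cassert>
  #include <cmath>
  #include <cstdint>

  int main() {
    float x = -3.5f;
    uint32_t bits = std::bit_cast<uint32_t>(x);
    assert(std::bit_cast<float>(bits ^ 0x80000000u) == -x);            // fneg
    assert(std::bit_cast<float>(bits & 0x7fffffffu) == std::fabs(x));  // fabs
    assert(std::bit_cast<float>(bits | 0x80000000u) == -std::fabs(x)); // fneg(fabs)
    return 0;
  }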
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 77 ++++++++++++++++++-
 .../AMDGPU/integer-select-source-modifiers.ll | 40 +++-------
 2 files changed, 86 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e64d2162441ab..4a719d8e145f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4842,6 +4842,64 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   return SDValue();
 }
 
+static EVT IntToFloatVT(EVT VT) {
+  return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT(
+                                                   VT.getScalarSizeInBits()),
+                                               VT.getVectorNumElements())
+                            : MVT::getFloatingPointVT(VT.getFixedSizeInBits());
+}
+
+static SDValue BitwiseToSrcModifierOp(SDValue N,
+                                      TargetLowering::DAGCombinerInfo &DCI) {
+
+  unsigned Opc = N.getNode()->getOpcode();
+  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue LHS = N.getNode()->getOperand(0);
+  SDValue RHS = N.getNode()->getOperand(1);
+  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
+
+  if (!CRHS)
+    return SDValue();
+
+  EVT VT = RHS.getValueType();
+
+  assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) &&
+         "Expected i32, v2i32 or i64 value type.");
+
+  uint64_t Mask = 0;
+  if (VT.isVector()) {
+    SDValue Splat = DAG.getSplatValue(RHS);
+    const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat);
+    Mask = C->getZExtValue();
+  } else
+    Mask = CRHS->getZExtValue();
+
+  EVT FVT = IntToFloatVT(VT);
+  SDValue BC = DAG.getNode(ISD::BITCAST, SDLoc(N), FVT, LHS);
+
+  switch (Opc) {
+  case ISD::XOR:
+    if (Mask == 0x80000000u || Mask == 0x8000000000000000u)
+      return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
+    return SDValue();
+  case ISD::OR:
+    if (Mask == 0x80000000u || Mask == 0x8000000000000000u) {
+      SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
+      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs);
+    }
+    return SDValue();
+  case ISD::AND:
+    if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu)
+      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC);
+    return SDValue();
+  default:
+    return SDValue();
+  }
+}
+
 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
@@ -4876,12 +4934,25 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     }
 
     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
-      SDValue MinMax
-        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+      SDValue MinMax =
+          combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
       // Revisit this node so we can catch min3/max3/med3 patterns.
-      //DCI.AddToWorklist(MinMax.getNode());
+      // DCI.AddToWorklist(MinMax.getNode());
       return MinMax;
     }
+
+    // Support source modifiers as integer.
+    if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
+      SDLoc SL(N);
+      SDValue LHS = N->getOperand(1);
+      SDValue RHS = N->getOperand(2);
+      if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) {
+        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, VT, RHS);
+        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, SrcMod, FRHS);
+        SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+        return BC;
+      }
+    }
   }
 
   // There's no reason to not do this if the condition has other uses.
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index dd6cf9bc6c592..2db20e672c303 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -8,18 +8,15 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-LABEL: fneg_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i32 %a, u0x80000000
   %cmp = icmp eq i32 %cond, zeroinitializer
@@ -31,24 +28,19 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GCN-LABEL: fneg_select_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer
@@ -60,18 +52,15 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-LABEL: fabs_select_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i32 %a, u0x7fffffff
   %cmp = icmp eq i32 %cond, zeroinitializer
@@ -83,24 +72,19 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GCN-LABEL: fabs_select_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer

>From dea39d166b9b50e053d3e6ceeccc7fcdb304b13a Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 10:32:27 -0500
Subject: [PATCH 04/28] Simplify switch in BitwiseToSrcModifierOp()

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 4a719d8e145f4..e9e4c2c35f1e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4884,20 +4884,21 @@ static SDValue BitwiseToSrcModifierOp(SDValue N,
   case ISD::XOR:
     if (Mask == 0x80000000u || Mask == 0x8000000000000000u)
       return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
-    return SDValue();
+    break;
   case ISD::OR:
     if (Mask == 0x80000000u || Mask == 0x8000000000000000u) {
       SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
       return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs);
     }
-    return SDValue();
+    break;
   case ISD::AND:
     if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu)
       return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC);
-    return SDValue();
+    break;
   default:
     return SDValue();
   }
+  return SDValue();
 }
 
 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,

>From 004dc9fb25514bcd37756148b4ceeb04133f9e3e Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 10:46:53 -0500
Subject: [PATCH 05/28] [NFC] Correct typo in BitwiseToSrcModifierOp()

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e9e4c2c35f1e4..f13a1e12f1a7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4887,8 +4887,8 @@ static SDValue BitwiseToSrcModifierOp(SDValue N,
     break;
   case ISD::OR:
     if (Mask == 0x80000000u || Mask == 0x8000000000000000u) {
-      SDValue Abs = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
-      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Abs);
+      SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
+      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Neg);
     }
     break;
   case ISD::AND:

>From b27ce624f2e3d9f47a6fdd23fb75e34e8c21e10b Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 11:11:31 -0500
Subject: [PATCH 06/28] Fix bitcast type in performSelectCombine()

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f13a1e12f1a7d..2c89fe9e12a7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4943,13 +4943,15 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     }
 
     // Support source modifiers as integer.
+    // (select c, (xor/or/and x, c), y) -> (bitcast (select c)))
     if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
       SDLoc SL(N);
       SDValue LHS = N->getOperand(1);
       SDValue RHS = N->getOperand(2);
       if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) {
-        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, VT, RHS);
-        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond, SrcMod, FRHS);
+        EVT FVT = IntToFloatVT(VT);
+        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS);
         SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
         return BC;
       }

>From f503034123bbaa95523b6c607ce35f53d440482f Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 11:31:26 -0500
Subject: [PATCH 07/28] Respond to first review comments

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 30 +++++++------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2c89fe9e12a7e..86eb6cf622fa3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4857,8 +4857,8 @@ static SDValue BitwiseToSrcModifierOp(SDValue N,
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
-  SDValue LHS = N.getNode()->getOperand(0);
-  SDValue RHS = N.getNode()->getOperand(1);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
   ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
 
   if (!CRHS)
@@ -4869,31 +4869,25 @@ static SDValue BitwiseToSrcModifierOp(SDValue N,
   assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) &&
          "Expected i32, v2i32 or i64 value type.");
 
-  uint64_t Mask = 0;
-  if (VT.isVector()) {
-    SDValue Splat = DAG.getSplatValue(RHS);
-    const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Splat);
-    Mask = C->getZExtValue();
-  } else
-    Mask = CRHS->getZExtValue();
-
+  uint64_t Mask = CRHS->getZExtValue();
   EVT FVT = IntToFloatVT(VT);
-  SDValue BC = DAG.getNode(ISD::BITCAST, SDLoc(N), FVT, LHS);
+  SDLoc SL = SDLoc(N);
+  SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
 
   switch (Opc) {
   case ISD::XOR:
     if (Mask == 0x80000000u || Mask == 0x8000000000000000u)
-      return DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
+      return DAG.getNode(ISD::FNEG, SL, FVT, BC);
     break;
   case ISD::OR:
     if (Mask == 0x80000000u || Mask == 0x8000000000000000u) {
       SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
-      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, Neg);
+      return DAG.getNode(ISD::FABS, SL, FVT, Neg);
     }
     break;
   case ISD::AND:
     if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu)
-      return DAG.getNode(ISD::FABS, SDLoc(N), FVT, BC);
+      return DAG.getNode(ISD::FABS, SL, FVT, BC);
     break;
   default:
     return SDValue();
@@ -4945,12 +4939,10 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     // Support source modifiers as integer.
     // (select c, (xor/or/and x, c), y) -> (bitcast (select c)))
     if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
-      SDLoc SL(N);
-      SDValue LHS = N->getOperand(1);
-      SDValue RHS = N->getOperand(2);
-      if (SDValue SrcMod = BitwiseToSrcModifierOp(LHS, DCI)) {
+      if (SDValue SrcMod = BitwiseToSrcModifierOp(True, DCI)) {
+        SDLoc SL(N);
         EVT FVT = IntToFloatVT(VT);
-        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False);
         SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS);
         SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
         return BC;

>From e073552b23bd1c4af4e740254a777335613a9603 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 11:38:29 -0500
Subject: [PATCH 08/28] Respond to second review comments - rename function and
 correct test

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  12 +-
 .../AMDGPU/integer-select-source-modifiers.ll | 168 +++++++++++++++++-
 2 files changed, 165 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 86eb6cf622fa3..bd7979f5f17e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4842,15 +4842,15 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   return SDValue();
 }
 
-static EVT IntToFloatVT(EVT VT) {
+static EVT getFloatVT(EVT VT) {
   return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT(
                                                    VT.getScalarSizeInBits()),
                                                VT.getVectorNumElements())
                             : MVT::getFloatingPointVT(VT.getFixedSizeInBits());
 }
 
-static SDValue BitwiseToSrcModifierOp(SDValue N,
-                                      TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue getBitwiseToSrcModifierOp(SDValue N,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
 
   unsigned Opc = N.getNode()->getOpcode();
   if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND)
@@ -4870,7 +4870,7 @@ static SDValue BitwiseToSrcModifierOp(SDValue N,
          "Expected i32, v2i32 or i64 value type.");
 
   uint64_t Mask = CRHS->getZExtValue();
-  EVT FVT = IntToFloatVT(VT);
+  EVT FVT = getFloatVT(VT);
   SDLoc SL = SDLoc(N);
   SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
 
@@ -4939,9 +4939,9 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     // Support source modifiers as integer.
     // (select c, (xor/or/and x, c), y) -> (bitcast (select c)))
     if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
-      if (SDValue SrcMod = BitwiseToSrcModifierOp(True, DCI)) {
+      if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) {
         SDLoc SL(N);
-        EVT FVT = IntToFloatVT(VT);
+        EVT FVT = getFloatVT(VT);
         SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False);
         SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS);
         SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index 2db20e672c303..8e1905475f628 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -1,8 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
 
 define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-LABEL: fneg_select_i32:
@@ -12,6 +11,20 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_select_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -34,6 +47,24 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_select_v2i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -56,6 +87,20 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fabs_select_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fabs_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -78,6 +123,24 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fabs_select_v2i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fabs_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -101,6 +164,22 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_fabs_select_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_fabs_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -127,6 +206,28 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_fabs_select_v2i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_fabs_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -154,6 +255,24 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_select_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -178,6 +297,24 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fabs_select_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fabs_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,6 +339,24 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX7-LABEL: fneg_fabs_select_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: fneg_fabs_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -215,8 +370,3 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
   %select = select i1 %cmp, i64 %neg.a, i64 %b
   ret i64 %select
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11-FAKE16: {{.*}}
-; GFX11-TRUE16: {{.*}}
-; GFX7: {{.*}}
-; GFX9: {{.*}}

>From 000ddc8efbedfe9cec22bbcfb6f57538f47d71e6 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 11:57:18 -0500
Subject: [PATCH 09/28] [NFC] Remove incomplete dag-style comment

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index bd7979f5f17e8..b159746109dbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4937,7 +4937,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     }
 
     // Support source modifiers as integer.
-    // (select c, (xor/or/and x, c), y) -> (bitcast (select c)))
     if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
       if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) {
         SDLoc SL(N);

>From 2604329f86229792e85fa456c288629c56328b76 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 7 Jul 2025 12:58:02 -0500
Subject: [PATCH 10/28] Make test for bitwise src mods more stringent and
 correct fneg-fabs order
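
The order matters because the two modifiers do not commute: or'ing in the
sign bit corresponds to fneg(fabs(x)), while the previous composition,
fabs(fneg(x)), is just fabs(x) and drops the forced sign. A standalone
check in plain C++ (not LLVM code):

  #include <cassert>
  #include <cmath>

  int main() {
    double x = 2.0;
    assert(-std::fabs(x) == -2.0); // fneg(fabs(x)): what the OR fold should produce
    assert(std::fabs(-x) == 2.0);  // fabs(fneg(x)): loses the forced sign bit
    return 0;
  }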

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b159746109dbf..ecb535de7e33f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4876,17 +4876,20 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N,
 
   switch (Opc) {
   case ISD::XOR:
-    if (Mask == 0x80000000u || Mask == 0x8000000000000000u)
+    if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) ||
+        (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64))
       return DAG.getNode(ISD::FNEG, SL, FVT, BC);
     break;
   case ISD::OR:
-    if (Mask == 0x80000000u || Mask == 0x8000000000000000u) {
-      SDValue Neg = DAG.getNode(ISD::FNEG, SDLoc(N), FVT, BC);
-      return DAG.getNode(ISD::FABS, SL, FVT, Neg);
+    if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) ||
+        (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) {
+      SDValue Abs = DAG.getNode(ISD::ABS, SDLoc(N), FVT, BC);
+      return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
     }
     break;
   case ISD::AND:
-    if (Mask == 0x7fffffffu || Mask == 0x7fffffffffffffffu)
+    if ((Mask == 0x7fffffffu && VT.getFixedSizeInBits() == 32) ||
+        (Mask == 0x7fffffffffffffffu && VT.getFixedSizeInBits() == 64))
       return DAG.getNode(ISD::FABS, SL, FVT, BC);
     break;
   default:

>From 97d93d620759cee2e710594c0b35b94e2eb37750 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 8 Jul 2025 06:33:32 -0500
Subject: [PATCH 11/28] Reviewer corrections

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ecb535de7e33f..52df6171c37a0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4843,10 +4843,10 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
 }
 
 static EVT getFloatVT(EVT VT) {
-  return VT = VT.isVector() ? MVT::getVectorVT(MVT::getFloatingPointVT(
-                                                   VT.getScalarSizeInBits()),
-                                               VT.getVectorNumElements())
-                            : MVT::getFloatingPointVT(VT.getFixedSizeInBits());
+  return VT.isVector() ? MVT::getVectorVT(
+                             MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+                             VT.getVectorNumElements())
+                       : MVT::getFloatingPointVT(VT.getFixedSizeInBits());
 }
 
 static SDValue getBitwiseToSrcModifierOp(SDValue N,
@@ -4883,7 +4883,7 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N,
   case ISD::OR:
     if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) ||
         (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) {
-      SDValue Abs = DAG.getNode(ISD::ABS, SDLoc(N), FVT, BC);
+      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC);
       return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
     }
     break;

>From f255ddcaf7f8724c31f0ac177b2cb0b1b3b685a2 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 8 Jul 2025 10:08:50 -0500
Subject: [PATCH 12/28] Refactor to support source modifiers on either or both
 operands, and extend the test. 64-bit is still problematic, as the legalizer
 splits some 64-bit ops into v2i32.
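
The mask checks now go through APInt predicates instead of hard-coded
32/64-bit constants. A width-parameterised sketch of what those predicates
accept (plain C++, not llvm::APInt):

  #include <cassert>
  #include <cstdint>

  // isSignMask() accepts only the sign bit of the operand's width;
  // isMaxSignedValue() only the complementary all-but-sign mask.
  static bool isSignMask(uint64_t v, unsigned bits) {
    return v == (1ull << (bits - 1));
  }
  static bool isMaxSignedValue(uint64_t v, unsigned bits) {
    return v == (1ull << (bits - 1)) - 1;
  }

  int main() {
    assert(isSignMask(0x80000000u, 32));
    assert(isSignMask(0x8000000000000000ull, 64));
    assert(!isSignMask(0x80000000u, 64)); // a 32-bit sign mask is not a 64-bit one
    assert(isMaxSignedValue(0x7fffffffu, 32));
    assert(isMaxSignedValue(0x7fffffffffffffffull, 64));
    return 0;
  }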

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  39 +-
 .../AMDGPU/integer-select-source-modifiers.ll | 589 ++++++++++++++----
 2 files changed, 471 insertions(+), 157 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 52df6171c37a0..3f88c949fe96a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4843,17 +4843,15 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
 }
 
 static EVT getFloatVT(EVT VT) {
-  return VT.isVector() ? MVT::getVectorVT(
-                             MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
-                             VT.getVectorNumElements())
-                       : MVT::getFloatingPointVT(VT.getFixedSizeInBits());
+  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+  return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
 }
 
 static SDValue getBitwiseToSrcModifierOp(SDValue N,
                                          TargetLowering::DAGCombinerInfo &DCI) {
 
   unsigned Opc = N.getNode()->getOpcode();
-  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::AND)
+  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
@@ -4865,31 +4863,23 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N,
     return SDValue();
 
   EVT VT = RHS.getValueType();
-
-  assert((VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) &&
-         "Expected i32, v2i32 or i64 value type.");
-
-  uint64_t Mask = CRHS->getZExtValue();
   EVT FVT = getFloatVT(VT);
   SDLoc SL = SDLoc(N);
   SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
 
   switch (Opc) {
   case ISD::XOR:
-    if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) ||
-        (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64))
+    if (CRHS->getAPIntValue().isSignMask())
       return DAG.getNode(ISD::FNEG, SL, FVT, BC);
     break;
   case ISD::OR:
-    if ((Mask == 0x80000000u && VT.getFixedSizeInBits() == 32) ||
-        (Mask == 0x8000000000000000u && VT.getFixedSizeInBits() == 64)) {
+    if (CRHS->getAPIntValue().isSignMask()) {
       SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC);
       return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
     }
     break;
   case ISD::AND:
-    if ((Mask == 0x7fffffffu && VT.getFixedSizeInBits() == 32) ||
-        (Mask == 0x7fffffffffffffffu && VT.getFixedSizeInBits() == 64))
+    if (CRHS->getAPIntValue().isMaxSignedValue())
       return DAG.getNode(ISD::FABS, SL, FVT, BC);
     break;
   default:
@@ -4939,15 +4929,20 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       return MinMax;
     }
 
-    // Support source modifiers as integer.
+    // Support source modifiers on integer types.
     if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
-      if (SDValue SrcMod = getBitwiseToSrcModifierOp(True, DCI)) {
+      SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI);
+      SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
+      if (SrcModTrue || SrcModFalse) {
         SDLoc SL(N);
         EVT FVT = getFloatVT(VT);
-        SDValue FRHS = DAG.getNode(ISD::BITCAST, SL, FVT, False);
-        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, SrcMod, FRHS);
-        SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
-        return BC;
+        SDValue FLHS =
+            SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
+        SDValue FRHS = SrcModFalse ? SrcModFalse
+                                   : DAG.getNode(ISD::BITCAST, SL, FVT, False);
+        ;
+        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
+        return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
       }
     }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index 8e1905475f628..4fc31493a05f9 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -1,31 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
 
-define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
-; GCN-LABEL: fneg_select_i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fneg_select_i32:
+define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_select_i32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_select_i32:
+; GFX9-LABEL: fneg_select_i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_select_i32:
+; GFX11-LABEL: fneg_select_i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -37,17 +30,91 @@ define i32 @fneg_select_i32(i32 %cond, i32 %a, i32 %b) {
   ret i32 %select
 }
 
-define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GCN-LABEL: fneg_select_v2i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_select_i32_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX7-LABEL: fneg_select_v2i32:
+; GFX9-LABEL: fneg_select_i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i32 %a, u0x80000000
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %b, i32 %neg.a
+  ret i32 %select
+}
+
+define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_select_i32_both:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i32_both:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i32_both:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i32 %a, u0x80000000
+  %neg.b = xor i32 %b, u0x80000000
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %neg.a, i32 %neg.b
+  ret i32 %select
+}
+
+define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_1_fabs_2_select_i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_1_fabs_2_select_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_1_fabs_2_select_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i32 %a, u0x80000000
+  %abs.a = and i32 %a, u0x7fffffff
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %neg.a, i32 %abs.a
+  ret i32 %select
+}
+
+define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fneg_select_v2i32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -56,7 +123,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_select_v2i32:
+; GFX9-LABEL: fneg_select_v2i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -65,7 +132,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_select_v2i32:
+; GFX11-LABEL: fneg_select_v2i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -79,29 +146,55 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %select
 }
 
-define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
-; GCN-LABEL: fabs_select_i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fabs_select_i32:
+define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fneg_select_v2i32_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_v2i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_v2i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+  ret <2 x i32> %select
+}
+
+define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fabs_select_i32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fabs_select_i32:
+; GFX9-LABEL: fabs_select_i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fabs_select_i32:
+; GFX11-LABEL: fabs_select_i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -113,17 +206,35 @@ define i32 @fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
   ret i32 %select
 }
 
-define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GCN-LABEL: fabs_select_v2i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fabs_select_v2i32:
+define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fabs_select_i32_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = and i32 %a, u0x7fffffff
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %b, i32 %neg.a
+  ret i32 %select
+}
+
+define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fabs_select_v2i32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -132,7 +243,7 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fabs_select_v2i32:
+; GFX9-LABEL: fabs_select_v2i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -141,7 +252,7 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fabs_select_v2i32:
+; GFX11-LABEL: fabs_select_v2i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -155,38 +266,93 @@ define <2 x i32> @fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
   ret <2 x i32> %select
 }
 
-define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
-; GCN-LABEL: fneg_fabs_select_i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fneg_fabs_select_i32:
+define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fabs_select_v2i32_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_v2i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_v2i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+  ret <2 x i32> %select
+}
+
+define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fneg_select_v2i32:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor <2 x i32> %a, splat (i32 u0x80000000)
+  %abs.a = and <2 x i32> %a, splat (i32 u0x7fffffff)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %abs.a, <2 x i32> %neg.a
+  ret <2 x i32> %select
+}
+
+define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_fabs_select_i32_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_fabs_select_i32:
+; GFX9-LABEL: fneg_fabs_select_i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_fabs_select_i32:
+; GFX11-LABEL: fneg_fabs_select_i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i32 %a, u0x80000000
   %cmp = icmp eq i32 %cond, zeroinitializer
@@ -194,50 +360,59 @@ define i32 @fneg_fabs_select_i32(i32 %cond, i32 %a, i32 %b) {
   ret i32 %select
 }
 
-define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GCN-LABEL: fneg_fabs_select_v2i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fneg_fabs_select_v2i32:
+define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
+; GFX7-LABEL: fneg_fabs_select_i32_2:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = or i32 %a, u0x80000000
+  %cmp = icmp eq i32 %cond, zeroinitializer
+  %select = select i1 %cmp, i32 %b, i32 %neg.a
+  ret i32 %select
+}
+
+define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fneg_fabs_select_v2i32_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_fabs_select_v2i32:
+; GFX9-LABEL: fneg_fabs_select_v2i32_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_fabs_select_v2i32:
+; GFX11-LABEL: fneg_fabs_select_v2i32_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
   %cmp = icmp eq <2 x i32> %cond, zeroinitializer
@@ -245,17 +420,41 @@ define <2 x i32> @fneg_fabs_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32
   ret <2 x i32> %select
 }
 
-define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
-; GCN-LABEL: fneg_select_i64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fneg_select_i64:
+define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GFX7-LABEL: fneg_fabs_select_v2i32_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_v2i32_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_v2i32_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = or <2 x i32> %a, splat (i32 u0x80000000)
+  %cmp = icmp eq <2 x i32> %cond, zeroinitializer
+  %select = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> %neg.a
+  ret <2 x i32> %select
+}
+
+define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fneg_select_i64_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -264,7 +463,7 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_select_i64:
+; GFX9-LABEL: fneg_select_i64_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -273,7 +472,7 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_select_i64:
+; GFX11-LABEL: fneg_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -287,17 +486,78 @@ define i64 @fneg_select_i64(i64 %cond, i64 %a, i64 %b) {
   ret i64 %select
 }
 
-define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
-; GCN-LABEL: fabs_select_i64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fabs_select_i64:
+define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fneg_select_i64_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_select_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i64 %a, u0x8000000000000000
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %b, i64 %neg.a
+  ret i64 %select
+}
+
+define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fneg_1_fabs_2_select_i64:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_1_fabs_2_select_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_1_fabs_2_select_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i64 %a, u0x8000000000000000
+  %abs.b = and i64 %b, u0x7fffffffffffffff
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %neg.a, i64 %abs.b
+  ret i64 %select
+}
+
+define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fabs_select_i64_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -306,7 +566,7 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fabs_select_i64:
+; GFX9-LABEL: fabs_select_i64_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -315,7 +575,7 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fabs_select_i64:
+; GFX11-LABEL: fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -329,17 +589,41 @@ define i64 @fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
   ret i64 %select
 }
 
-define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
-; GCN-LABEL: fneg_fabs_select_i64:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: fneg_fabs_select_i64:
+define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fabs_select_i64_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fabs_select_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fabs_select_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 0x7fffffff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = and i64 %a, u0x7fffffffffffffff
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %b, i64 %neg.a
+  ret i64 %select
+}
+
+define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fneg_fabs_select_i64_1:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -348,7 +632,7 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: fneg_fabs_select_i64:
+; GFX9-LABEL: fneg_fabs_select_i64_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
@@ -357,7 +641,7 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_fabs_select_i64:
+; GFX11-LABEL: fneg_fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
@@ -370,3 +654,38 @@ define i64 @fneg_fabs_select_i64(i64 %cond, i64 %a, i64 %b) {
   %select = select i1 %cmp, i64 %neg.a, i64 %b
   ret i64 %select
 }
+
+define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
+; GFX7-LABEL: fneg_fabs_select_i64_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_fabs_select_i64_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fneg_fabs_select_i64_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = or i64 %a, u0x8000000000000000
+  %cmp = icmp eq i64 %cond, zeroinitializer
+  %select = select i1 %cmp, i64 %b, i64 %neg.a
+  ret i64 %select
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
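
For readers skimming the test diff above: every transform here relies on the
fact that, for IEEE-754 values, a bitwise op against the sign bit of the
integer representation is exactly a float source modifier on the bitcast
value (xor -> fneg, and -> fabs, or -> fneg(fabs)). That equivalence can be
checked in isolation with a minimal standalone C++20 sketch; this is
illustrative only and not part of the patch:

  #include <bit>
  #include <cmath>
  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t A = 0x40490FDBu;  // bit pattern of (float)pi
    float F = std::bit_cast<float>(A);
    // xor with the sign bit flips it: integer fneg.
    bool Neg = (A ^ 0x80000000u) == std::bit_cast<uint32_t>(-F);
    // and with ~sign clears it: integer fabs.
    bool Abs = (A & 0x7fffffffu) == std::bit_cast<uint32_t>(std::fabs(F));
    // or with the sign bit sets it: integer fneg(fabs).
    bool NegAbs = (A | 0x80000000u) == std::bit_cast<uint32_t>(-std::fabs(F));
    std::printf("%s\n", Neg && Abs && NegAbs ? "all equivalent" : "mismatch");
  }

The i64 tests apply the same masks widened to 64 bits, so only the high
32-bit half of the value is affected, which is why the generated code applies
the xor/and/or to v3 (the high half of %a) and selects the low half
unmodified.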

>From 2e2249aa617db150369cabec03ad467841a868b6 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 8 Jul 2025 10:17:38 -0500
Subject: [PATCH 13/28] Fix Typo.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3f88c949fe96a..7916f8203c390 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4940,7 +4940,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
             SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
         SDValue FRHS = SrcModFalse ? SrcModFalse
                                    : DAG.getNode(ISD::BITCAST, SL, FVT, False);
-        ;
         SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
         return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
       }
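
For context, this hunk is the tail of the select combine the PR adds: as the
surrounding code suggests, getBitwiseToSrcModifierOp returns the operand
rewritten as an fneg/fabs/fneg(fabs) node when the bitwise pattern matches,
and the code above bitcasts the remaining integer operand, rebuilds the
select in the float domain, and bitcasts the result back. A hypothetical
standalone classifier mirroring the recognition step (the names and structure
here are illustrative, not the SelectionDAG implementation):

  #include <cstdint>
  #include <cstdio>
  #include <optional>

  enum class SrcMod { Neg, Abs, NegAbs };

  // Map a bitwise op on a 32-bit value to the float source modifier it
  // implements, if any. Only constants that touch exactly the sign bit
  // (or its complement, for 'and') qualify.
  std::optional<SrcMod> classify(char Opc, uint32_t Mask) {
    const uint32_t Sign = 0x80000000u;
    switch (Opc) {
    case '^': if (Mask == Sign)  return SrcMod::Neg;    break;  // xor -> fneg
    case '&': if (Mask == ~Sign) return SrcMod::Abs;    break;  // and -> fabs
    case '|': if (Mask == Sign)  return SrcMod::NegAbs; break;  // or  -> fneg(fabs)
    default: break;
    }
    return std::nullopt;
  }

  int main() {
    std::printf("%d %d %d\n",
                classify('^', 0x80000000u) == SrcMod::Neg,
                classify('&', 0x7fffffffu) == SrcMod::Abs,
                classify('|', 0x80000000u) == SrcMod::NegAbs);
  }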

>From a505a7204fdf9754228b36c454c9ed16ad5ed1e1 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 8 Jul 2025 11:17:25 -0500
Subject: [PATCH 14/28] Respond to reviewer - Add i16 tests, simplify obtaining
 the type

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   3 +-
 .../AMDGPU/integer-select-source-modifiers.ll | 685 +++++++++---------
 2 files changed, 338 insertions(+), 350 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7916f8203c390..e6b611eda3a10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4935,7 +4935,8 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
       if (SrcModTrue || SrcModFalse) {
         SDLoc SL(N);
-        EVT FVT = getFloatVT(VT);
+        EVT FVT =
+            SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
         SDValue FLHS =
             SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
         SDValue FRHS = SrcModFalse ? SrcModFalse
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index 4fc31493a05f9..eed83dd905c38 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -1,22 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_select_i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, -v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i32_1:
 ; GFX11:       ; %bb.0:
@@ -31,19 +25,12 @@ define i32 @fneg_select_i32_1(i32 %cond, i32 %a, i32 %b) {
 }
 
 define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_select_i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v1, v2, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i32_2:
 ; GFX11:       ; %bb.0:
@@ -58,19 +45,12 @@ define i32 @fneg_select_i32_2(i32 %cond, i32 %a, i32 %b) {
 }
 
 define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_select_i32_both:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_i32_both:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_i32_both:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, -v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i32_both:
 ; GFX11:       ; %bb.0:
@@ -86,19 +66,12 @@ define i32 @fneg_select_i32_both(i32 %cond, i32 %a, i32 %b) {
 }
 
 define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_1_fabs_2_select_i32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_1_fabs_2_select_i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_1_fabs_2_select_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v1|, -v1, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_1_fabs_2_select_i32:
 ; GFX11:       ; %bb.0:
@@ -114,23 +87,14 @@ define i32 @fneg_1_fabs_2_select_i32(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fneg_select_v2i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_v2i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_v2i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, -v2, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_v2i32_1:
 ; GFX11:       ; %bb.0:
@@ -147,23 +111,14 @@ define <2 x i32> @fneg_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %
 }
 
 define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fneg_select_v2i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_v2i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_v2i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_v2i32_2:
 ; GFX11:       ; %bb.0:
@@ -180,19 +135,12 @@ define <2 x i32> @fneg_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %
 }
 
 define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fabs_select_i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, |v1|, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i32_1:
 ; GFX11:       ; %bb.0:
@@ -207,19 +155,12 @@ define i32 @fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
 }
 
 define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fabs_select_i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v1|, v2, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i32_2:
 ; GFX11:       ; %bb.0:
@@ -234,23 +175,14 @@ define i32 @fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fabs_select_v2i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_v2i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_v2i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, |v2|, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_v2i32_1:
 ; GFX11:       ; %bb.0:
@@ -267,23 +199,14 @@ define <2 x i32> @fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %
 }
 
 define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fabs_select_v2i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_v2i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_v2i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v2|, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_v2i32_2:
 ; GFX11:       ; %bb.0:
@@ -300,23 +223,14 @@ define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %
 }
 
 define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fneg_select_v2i32:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_v2i32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_v2i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, |v2|, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_v2i32:
 ; GFX11:       ; %bb.0:
@@ -334,19 +248,12 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 }
 
 define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_fabs_select_i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, -|v1|, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i32_1:
 ; GFX11:       ; %bb.0:
@@ -361,19 +268,12 @@ define i32 @fneg_fabs_select_i32_1(i32 %cond, i32 %a, i32 %b) {
 }
 
 define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
-; GFX7-LABEL: fneg_fabs_select_i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v1|, v2, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i32_2:
 ; GFX11:       ; %bb.0:
@@ -388,23 +288,14 @@ define i32 @fneg_fabs_select_i32_2(i32 %cond, i32 %a, i32 %b) {
 }
 
 define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fneg_fabs_select_v2i32_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_v2i32_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_v2i32_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, -|v2|, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_v2i32_1:
 ; GFX11:       ; %bb.0:
@@ -421,23 +312,14 @@ define <2 x i32> @fneg_fabs_select_v2i32_1(<2 x i32> %cond, <2 x i32> %a, <2 x i
 }
 
 define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GFX7-LABEL: fneg_fabs_select_v2i32_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_v2i32_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_v2i32_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v2|, v4, vcc
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_v2i32_2:
 ; GFX11:       ; %bb.0:
@@ -454,23 +336,14 @@ define <2 x i32> @fneg_fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i
 }
 
 define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fneg_select_i64_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_i64_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_i64_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_1:
 ; GFX11:       ; %bb.0:
@@ -487,23 +360,14 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fneg_select_i64_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_select_i64_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_select_i64_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_2:
 ; GFX11:       ; %bb.0:
@@ -520,25 +384,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fneg_1_fabs_2_select_i64:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_1_fabs_2_select_i64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_1_fabs_2_select_i64:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_1_fabs_2_select_i64:
 ; GFX11:       ; %bb.0:
@@ -557,23 +411,14 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fabs_select_i64_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_i64_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_i64_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
@@ -590,23 +435,14 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fabs_select_i64_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fabs_select_i64_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fabs_select_i64_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
@@ -623,23 +459,14 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fneg_fabs_select_i64_1:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_i64_1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_i64_1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
@@ -656,23 +483,14 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 }
 
 define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
-; GFX7-LABEL: fneg_fabs_select_i64_2:
-; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX7-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX7-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fneg_fabs_select_i64_2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: fneg_fabs_select_i64_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
@@ -687,5 +505,174 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
   %select = select i1 %cmp, i64 %b, i64 %neg.a
   ret i64 %select
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
+define i16 @fneg_select_i16_1(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_1:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_1:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_1:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_1:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i16 %a, u0x8000
+  %cmp = icmp eq i16 %cond, zeroinitializer
+  %select = select i1 %cmp, i16 %neg.a, i16 %b
+  ret i16 %select
+}
+
+define i16 @fneg_select_i16_2(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_2:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_2:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v2.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_2:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i16 %a, u0x8000
+  %cmp = icmp eq i16 %cond, zeroinitializer
+  %select = select i1 %cmp, i16 %b, i16 %neg.a
+  ret i16 %select
+}
+
+define i16 @fneg_select_i16_both(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_select_i16_both:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_select_i16_both:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_select_i16_both:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_select_i16_both:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i16 %a, u0x8000
+  %neg.b = xor i16 %b, u0x8000
+  %cmp = icmp eq i16 %cond, zeroinitializer
+  %select = select i1 %cmp, i16 %neg.a, i16 %neg.b
+  ret i16 %select
+}
+
+define i16 @fneg_1_fabs_2_select_i16(i16 %cond, i16 %a, i16 %b) {
+; GFX7-LABEL: fneg_1_fabs_2_select_i16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: fneg_1_fabs_2_select_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX9-NEXT:    v_cmp_eq_u16_e32 vcc, 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: fneg_1_fabs_2_select_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_and_b16 v1.l, 0x7fff, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: fneg_1_fabs_2_select_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+  %neg.a = xor i16 %a, u0x8000
+  %abs.b = and i16 %a, u0x7fff
+  %cmp = icmp eq i16 %cond, zeroinitializer
+  %select = select i1 %cmp, i16 %neg.a, i16 %abs.b
+  ret i16 %select
+}
+
+
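Reviewer aside, not part of the patch: the constants in the new i16 tests
lean on the IEEE half-precision encoding, where 0x3C00 is +1.0 and 0xBC00
is -1.0. A minimal standalone C++ sketch of the three bit identities the
combine treats as fneg, fabs and fneg(fabs):

  #include <cstdint>
  #include <cstdio>

  int main() {
    std::uint16_t One = 0x3C00;    // +1.0 in IEEE half precision
    std::uint16_t NegOne = 0xBC00; // -1.0 in IEEE half precision
    std::printf("%d %d %d\n",
                (One ^ 0x8000) == NegOne,  // xor sign bit   == fneg
                (NegOne & 0x7fff) == One,  // clear sign bit == fabs
                (One | 0x8000) == NegOne); // set sign bit   == fneg(fabs)
    return 0; // prints 1 1 1
  }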

>From a8bd72617bdf8c5616712b5d801f3b0c5a37fe53 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Tue, 8 Jul 2025 11:22:37 -0500
Subject: [PATCH 15/28] Inline bitcast node creation.
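
Rationale, hedged: DAG.getNode(ISD::BITCAST, ...) creates (or CSE-finds) a
node up front, so hoisting it above the switch can leave a dead node behind
on the bail-out paths. A generic C++ sketch of the lazy-construction shape
(assumed names, not the SelectionDAG API):

  #include <optional>

  struct Node { int Kind; };

  // Build the result only on the path that consumes it, so the early
  // returns allocate nothing - this mirrors moving the BITCAST into the
  // individual switch cases.
  std::optional<Node> fold(int Opc, bool PatternMatches) {
    if (Opc != /*XOR*/ 1 && Opc != /*AND*/ 2)
      return std::nullopt;
    if (!PatternMatches)
      return std::nullopt;
    return Node{Opc}; // constructed only when actually needed
  }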

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e6b611eda3a10..282c22930c709 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4865,22 +4865,24 @@ static SDValue getBitwiseToSrcModifierOp(SDValue N,
   EVT VT = RHS.getValueType();
   EVT FVT = getFloatVT(VT);
   SDLoc SL = SDLoc(N);
-  SDValue BC = DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
 
   switch (Opc) {
   case ISD::XOR:
     if (CRHS->getAPIntValue().isSignMask())
-      return DAG.getNode(ISD::FNEG, SL, FVT, BC);
+      return DAG.getNode(ISD::FNEG, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
     break;
   case ISD::OR:
     if (CRHS->getAPIntValue().isSignMask()) {
-      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT, BC);
+      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
+                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
       return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
     }
     break;
   case ISD::AND:
     if (CRHS->getAPIntValue().isMaxSignedValue())
-      return DAG.getNode(ISD::FABS, SL, FVT, BC);
+      return DAG.getNode(ISD::FABS, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
     break;
   default:
     return SDValue();

>From e5f1e67ee8968af97d61c330aa641eeb662e2f8f Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 11 Jul 2025 06:49:20 -0500
Subject: [PATCH 16/28] Add functional implementation for i64

While this is functional, it can be refactored and simplified; work on
that is in progress.
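
A hedged aside on why the split is sound (plain C++, not the patch): the
sign bit of an f64 pattern lives in the high 32 bits, so applying the
bitwise op to the high half alone reproduces the full-width result.

  #include <cstdint>
  #include <cstdio>

  int main() {
    std::uint64_t A = 0x400921FB54442D18ull;        // bits of the double pi
    std::uint64_t NegA = A ^ 0x8000000000000000ull; // full-width sign flip

    // Flip the sign in the high half only; pass the low half through.
    std::uint32_t Lo = static_cast<std::uint32_t>(A);
    std::uint32_t Hi = static_cast<std::uint32_t>(A >> 32) ^ 0x80000000u;
    std::uint64_t Rebuilt = (static_cast<std::uint64_t>(Hi) << 32) | Lo;

    std::printf("%d\n", NegA == Rebuilt); // prints 1
    return 0;
  }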
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 103 ++++++++++++++++--
 .../AMDGPU/integer-select-source-modifiers.ll |  70 +++++-------
 2 files changed, 122 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 282c22930c709..ef2e9c25f352e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4931,23 +4931,112 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       return MinMax;
     }
 
-    // Support source modifiers on integer types.
-    if (VT == MVT::i32 || VT == MVT::v2i32 || VT == MVT::i64) {
-      SDValue SrcModTrue = getBitwiseToSrcModifierOp(True, DCI);
-      SDValue SrcModFalse = getBitwiseToSrcModifierOp(False, DCI);
+    auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
+      SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI);
+      SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI);
       if (SrcModTrue || SrcModFalse) {
         SDLoc SL(N);
         EVT FVT =
             SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
         SDValue FLHS =
-            SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, True);
+            SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
         SDValue FRHS = SrcModFalse ? SrcModFalse
-                                   : DAG.getNode(ISD::BITCAST, SL, FVT, False);
+                                   : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
         SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
         return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+    }
+    return SDValue();
+  };
+
+    // Support source modifiers on integer operands.
+    if (VT == MVT::i32 || VT == MVT::v2i32)
+      if (SDValue F = FoldSrcMods(True, False, VT))
+        return F;
+
+    // For i64, if a source modifier is to be folded in, we split into two
+    // i32 selects of the high and low halves. The operator need only be
+    // applied to the high half, since that is where the sign bit lives.
+    if (VT == MVT::i64) {
+      bool TrueHasModifierOp =
+          (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
+           True.getOpcode() == ISD::XOR);
+
+      bool FalseHasModifierOp =
+          (False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
+           False.getOpcode() == ISD::XOR);
+
+      ConstantSDNode *CTrueRHS = nullptr;
+      if (TrueHasModifierOp) {
+        SDValue TrueRHS = True->getOperand(1);
+        CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
+      }
+
+      ConstantSDNode *CFalseRHS = nullptr;
+      if (FalseHasModifierOp) {
+        SDValue FalseRHS = False->getOperand(1);
+        CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
+      }
+
+      // If True or False is a candidate for source modifier folding, extract
+      // the high value using APInt and reconstruct a ConstantSDNode.
+      SDValue TrueHiOp;
+      SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
+      SDValue TrueLo;
+      SDValue TrueHi;
+      if (CTrueRHS) {
+        SDValue TrueLHS = True->getOperand(0);
+        SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
+        TrueLo = getLoHalf64(TrueLHS, DAG);
+        APInt CTrueRHSHiBits =
+            CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
+        SDValue CTrueRHSHiVal =
+            DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
+        unsigned OpcTrue = True.getOpcode();
+        TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
+                               CTrueRHSHiVal);
+      } else {
+        TrueLo = getLoHalf64(BCTrue, DAG);
+        TrueHi = getHiHalf64(BCTrue, DAG);
+      }
+
+      SDValue FalseHiOp;
+      SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
+      SDValue FalseLo;
+      SDValue FalseHi;
+      if (CFalseRHS) {
+        SDValue FalseLHS = False->getOperand(0);
+        FalseLo = getLoHalf64(FalseLHS, DAG);
+        SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
+        APInt CFalseRHSHiBits =
+            CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
+        SDValue CFalseRHSHiVal =
+            DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
+        unsigned OpcFalse = False.getOpcode();
+        FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
+                                CFalseRHSHiVal);
+      } else {
+        FalseLo = getLoHalf64(BCFalse, DAG);
+        FalseHi = getHiHalf64(BCFalse, DAG);
+      }
+
+      if (CTrueRHS || CFalseRHS) {
+        // Place the low bits directly into the select. The operator is unneeded
+        // for these.
+        SDValue LoSelect =
+            DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
+        // If a source modifier may be folded use the bitwise-op of the high
+        // values, otherwise just pass the high part of the value.
+        SDValue FoldedHi =
+            FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
+                        CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
+
+        SDValue ResV =
+            DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
+        SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
+        return Res;
       }
     }
-  }
+}
 
   // There's no reason to not do this if the condition has other uses.
   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index eed83dd905c38..c3ce0d1aa739e 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -340,18 +340,15 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -364,18 +361,15 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -388,20 +382,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v5
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v5|, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_1_fabs_2_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %abs.b = and i64 %b, u0x7fffffffffffffff
@@ -415,18 +405,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, |v3|, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_and_b32 v1, 0x7fffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i64 %a, u0x7fffffffffffffff
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -439,18 +427,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, |v3|, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 0x7fffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i64 %a, u0x7fffffffffffffff
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -463,18 +449,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, -|v3|, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -487,18 +471,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_or_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v3|, v5, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v1, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer

>From 767cc114869c7ac835a7767517e63092275cf7ef Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Fri, 11 Jul 2025 12:27:01 -0500
Subject: [PATCH 17/28] Fix formatting

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index ef2e9c25f352e..a61f7ccc8b956 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4940,22 +4940,23 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
             SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
         SDValue FLHS =
             SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
-        SDValue FRHS = SrcModFalse ? SrcModFalse
-                                   : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+        SDValue FRHS =
+            SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
         SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
         return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
-    }
-    return SDValue();
-  };
+      }
+      return SDValue();
+    };
 
     // Support source modifiers on integer operands.
     if (VT == MVT::i32 || VT == MVT::v2i32)
       if (SDValue F = FoldSrcMods(True, False, VT))
         return F;
 
-    // For i64, if a source modifier is to be folded in, we split into two
-    // i32 selects of the high and low halves. The operator need only be
-    // applied to the high half, since that is where the sign bit lives.
+    // auto SplitSelect = [&]() -> std::pair(
+    //  For i64, if a source modifier is to be folded in, we split into two
+    //  i32 selects of the high and low halves. The operator need only be
+    //  applied to the high half, since that is where the sign bit lives.
     if (VT == MVT::i64) {
       bool TrueHasModifierOp =
           (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
@@ -5036,7 +5037,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
         return Res;
       }
     }
-}
+  }
 
   // There's no reason to not do this if the condition has other uses.
   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);

>From 686919f62711e0c6536d01c1e0c81bc457b28598 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 18:41:54 -0500
Subject: [PATCH 18/28] [DAGCombine] Move the AMDGPU combine to Target
 Independent DAGCombine

- Allows removal of the i64-specific code - the target-independent
  combine splits to i32 ops.

- Update quite a few AMDGPU tests; these all appear to be improvements
  in codegen. Need to double-check.
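
For reference, a hedged C++20 sketch (std::bit_cast, not the DAG code) of
the equivalence the now target-independent combine relies on: selecting
between (x ^ sign-mask) and y in integers matches selecting between
fneg(x) and y in floats and bitcasting back.

  #include <bit>
  #include <cstdint>
  #include <cstdio>

  int main() {
    std::uint32_t X = std::bit_cast<std::uint32_t>(1.5f);
    std::uint32_t Y = std::bit_cast<std::uint32_t>(-4.0f);
    for (bool C : {false, true}) {
      std::uint32_t IntSel = C ? (X ^ 0x80000000u) : Y; // integer select
      float FloatSel = C ? -std::bit_cast<float>(X)     // float select
                         : std::bit_cast<float>(Y);
      std::printf("%d\n", IntSel == std::bit_cast<std::uint32_t>(FloatSel));
    }
    return 0; // prints 1 twice
  }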
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   75 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   89 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    8 +
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |    4 +
 .../atomic_optimizations_global_pointer.ll    |   18 +-
 .../branch-folding-implicit-def-subreg.ll     |   18 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 2219 +++++++++--------
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |  302 ++-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |    7 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   58 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  133 +-
 .../AMDGPU/fptrunc.v2f16.no.fast.math.ll      |   64 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  462 ++--
 .../AMDGPU/integer-select-source-modifiers.ll |   54 +-
 llvm/test/CodeGen/AMDGPU/saddsat.ll           |   52 +-
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll      |    7 +-
 llvm/test/CodeGen/AMDGPU/ssubsat.ll           |  378 ++-
 17 files changed, 2016 insertions(+), 1932 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..4f58ffa47fd20 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -684,7 +684,7 @@ namespace {
                                   SDValue VecIn2, unsigned LeftIdx,
                                   bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
-
+    SDValue getBitwiseToSrcModifierOp(SDValue N);
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12175,6 +12175,56 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
+static EVT getFloatVT(EVT VT) {
+  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+  return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
+}
+
+SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
+
+  unsigned Opc = N.getNode()->getOpcode();
+  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
+    return SDValue();
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  if (!TLI.shouldFoldSelectWithIdentityConstant(N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
+    return SDValue();
+
+  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
+
+  if (!CRHS)
+    return SDValue();
+
+  EVT VT = RHS.getValueType();
+  EVT FVT = getFloatVT(VT);
+  SDLoc SL = SDLoc(N);
+
+  switch (Opc) {
+  case ISD::XOR:
+    if (CRHS->getAPIntValue().isSignMask())
+      return DAG.getNode(ISD::FNEG, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+    break;
+  case ISD::OR:
+    if (CRHS->getAPIntValue().isSignMask()) {
+      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
+                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+      return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
+    }
+    break;
+  case ISD::AND:
+    if (CRHS->getAPIntValue().isMaxSignedValue())
+      return DAG.getNode(ISD::FABS, SL, FVT,
+                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
+    break;
+  default:
+    return SDValue();
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12390,6 +12440,29 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
+  auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
+    SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS);
+    SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS);
+    if (SrcModTrue || SrcModFalse) {
+      SDLoc SL(N);
+      EVT FVT =
+          SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
+      SDValue FLHS =
+          SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
+      SDValue FRHS =
+          SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
+      SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
+      return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+    }
+    return SDValue();
+  };
+
+  // Identify bitmask operations that are source mods and create
+  // the relevant fneg, fabs or fneg+fabs.
+  if (VT == MVT::i32 || VT == MVT::v2i32)
+    if (SDValue F = FoldSrcMods(N1, N2, VT))
+      return F;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a61f7ccc8b956..7436de2d6a6a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4948,95 +4948,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       return SDValue();
     };
 
-    // Support source modifiers on integer operands.
-    if (VT == MVT::i32 || VT == MVT::v2i32)
-      if (SDValue F = FoldSrcMods(True, False, VT))
-        return F;
-
-    // auto SplitSelect = [&]() -> std::pair(
-    //  For i64, if a source modifier is to be folded in, we split into two
-    //  i32 selects of the high and low halves. The operator need only be
-    //  applied to the high half, since that is where the sign bit lives.
-    if (VT == MVT::i64) {
-      bool TrueHasModifierOp =
-          (True.getOpcode() == ISD::AND || True.getOpcode() == ISD::OR ||
-           True.getOpcode() == ISD::XOR);
-
-      bool FalseHasModifierOp =
-          (False.getOpcode() == ISD::AND || False.getOpcode() == ISD::OR ||
-           False.getOpcode() == ISD::XOR);
-
-      ConstantSDNode *CTrueRHS = nullptr;
-      if (TrueHasModifierOp) {
-        SDValue TrueRHS = True->getOperand(1);
-        CTrueRHS = dyn_cast<ConstantSDNode>(TrueRHS);
-      }
-
-      ConstantSDNode *CFalseRHS = nullptr;
-      if (FalseHasModifierOp) {
-        SDValue FalseRHS = False->getOperand(1);
-        CFalseRHS = dyn_cast<ConstantSDNode>(FalseRHS);
-      }
-
-      // If True or False is a candidate for source modifier folding, extract
-      // the high value using APInt and reconstruct a ConstantSDNode.
-      SDValue TrueHiOp;
-      SDValue BCTrue = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, True);
-      SDValue TrueLo;
-      SDValue TrueHi;
-      if (CTrueRHS) {
-        SDValue TrueLHS = True->getOperand(0);
-        SDValue TrueLHSHiVal = getHiHalf64(BCTrue, DAG);
-        TrueLo = getLoHalf64(TrueLHS, DAG);
-        APInt CTrueRHSHiBits =
-            CTrueRHS->getAPIntValue().getHiBits(32).trunc(32);
-        SDValue CTrueRHSHiVal =
-            DAG.getConstant(CTrueRHSHiBits, SDLoc(N), MVT::i32);
-        unsigned OpcTrue = True.getOpcode();
-        TrueHiOp = DAG.getNode(OpcTrue, SDLoc(N), MVT::i32, TrueLHSHiVal,
-                               CTrueRHSHiVal);
-      } else {
-        TrueLo = getLoHalf64(BCTrue, DAG);
-        TrueHi = getHiHalf64(BCTrue, DAG);
-      }
-
-      SDValue FalseHiOp;
-      SDValue BCFalse = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, False);
-      SDValue FalseLo;
-      SDValue FalseHi;
-      if (CFalseRHS) {
-        SDValue FalseLHS = False->getOperand(0);
-        FalseLo = getLoHalf64(FalseLHS, DAG);
-        SDValue FalseLHSHiVal = getHiHalf64(BCFalse, DAG);
-        APInt CFalseRHSHiBits =
-            CFalseRHS->getAPIntValue().getHiBits(32).trunc(32);
-        SDValue CFalseRHSHiVal =
-            DAG.getConstant(CFalseRHSHiBits, SDLoc(N), MVT::i32);
-        unsigned OpcFalse = False.getOpcode();
-        FalseHiOp = DAG.getNode(OpcFalse, SDLoc(N), MVT::i32, FalseLHSHiVal,
-                                CFalseRHSHiVal);
-      } else {
-        FalseLo = getLoHalf64(BCFalse, DAG);
-        FalseHi = getHiHalf64(BCFalse, DAG);
-      }
-
-      if (CTrueRHS || CFalseRHS) {
-        // Place the low bits directly into the select. The operator is unneeded
-        // for these.
-        SDValue LoSelect =
-            DAG.getNode(ISD::SELECT, SDLoc(N), MVT::i32, Cond, TrueLo, FalseLo);
-        // If a source modifier may be folded use the bitwise-op of the high
-        // values, otherwise just pass the high part of the value.
-        SDValue FoldedHi =
-            FoldSrcMods(CTrueRHS ? TrueHiOp : TrueHi,
-                        CFalseRHS ? FalseHiOp : FalseHi, MVT::i32);
-
-        SDValue ResV =
-            DAG.getBuildVector(MVT::v2i32, SDLoc(N), {FoldedHi, LoSelect});
-        SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), MVT::i64, ResV);
-        return Res;
-      }
-    }
   }
 
   // There's no reason to not do this if the condition has other uses.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e90316cee12fe..77632c1423f4e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15493,6 +15493,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
   return SDValue();
 }
 
+bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
+    unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
+    SDValue Y) const {
+  return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
+          BinOpcode == ISD::XOR) &&
+         (VT.getScalarType() == MVT::i32);
+}
+
 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index acf6158572a4d..f118bc37b9224 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -264,6 +264,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override;
 
+  bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT,
+                                            unsigned SelectOpcode, SDValue X,
+                                            SDValue Y) const override;
+
 private:
   // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
   // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 3ca7db155b385..7584d3eb12928 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7145,12 +7145,13 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -8838,12 +8839,13 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
-; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
+; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 92c63fead15ac..50efed6da381b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index ba4fe3685458d..d52fe845d62ec 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -275,14 +275,23 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: s_test_copysign_f16_10_mag:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_and_b32 s0, s0, 0x8000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s0, 0x4900
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_10_mag:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0x8000
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, 0x4900
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_10_mag:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0x8000
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, 0x4900
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half 10.0, half %sign)
   %cast = bitcast half %result to i16
   ret i16 %cast
@@ -864,21 +873,20 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; SI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, v5, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, v5, v6
+; SI-NEXT:    v_or_b32_e32 v7, 1, v6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 12, v4
-; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v5, 7, v3
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
-; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x7c00
@@ -914,21 +922,20 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v3
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v1
-; VI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v3
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -960,19 +967,18 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; GFX9-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, v4, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0xfffffc10, v1
-; GFX9-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v1, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v4, 7, v3
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -1002,36 +1008,35 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v3, v0
 ; GFX11-NEXT:    v_med3_i32 v3, v4, 0, 13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
+; GFX11-NEXT:    v_lshl_or_b32 v7, v1, 12, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    v_lshl_or_b32 v4, v1, 12, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc double %mag to half
@@ -1057,29 +1062,31 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; SI-NEXT:    s_or_b32 s3, s0, 0x1000
 ; SI-NEXT:    v_readfirstlane_b32 s5, v0
 ; SI-NEXT:    s_lshr_b32 s6, s3, s5
+; SI-NEXT:    s_or_b32 s7, s6, 1
 ; SI-NEXT:    s_lshl_b32 s5, s6, s5
 ; SI-NEXT:    s_cmp_lg_u32 s5, s3
-; SI-NEXT:    s_cselect_b32 s3, 1, 0
-; SI-NEXT:    s_addk_i32 s4, 0xfc10
-; SI-NEXT:    s_lshl_b32 s5, s4, 12
-; SI-NEXT:    s_or_b32 s3, s6, s3
-; SI-NEXT:    s_or_b32 s5, s0, s5
-; SI-NEXT:    s_cmp_lt_i32 s4, 1
-; SI-NEXT:    s_cselect_b32 s3, s3, s5
-; SI-NEXT:    s_and_b32 s5, s3, 7
-; SI-NEXT:    s_cmp_gt_i32 s5, 5
-; SI-NEXT:    s_cselect_b32 s6, 1, 0
-; SI-NEXT:    s_cmp_eq_u32 s5, 3
-; SI-NEXT:    s_cselect_b32 s5, 1, 0
-; SI-NEXT:    s_or_b32 s5, s5, s6
+; SI-NEXT:    s_cselect_b32 s3, s7, s6
+; SI-NEXT:    s_add_i32 s8, s4, 0xfffffc10
+; SI-NEXT:    s_lshl_b32 s4, s8, 12
+; SI-NEXT:    s_or_b32 s4, s0, s4
+; SI-NEXT:    s_cmp_lt_i32 s8, 1
+; SI-NEXT:    s_cselect_b32 s3, s3, s4
+; SI-NEXT:    s_and_b32 s6, s3, 7
+; SI-NEXT:    s_cmp_eq_u32 s6, 3
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 5
+; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; SI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
 ; SI-NEXT:    s_lshr_b32 s3, s3, 2
-; SI-NEXT:    s_add_i32 s3, s3, s5
-; SI-NEXT:    s_cmp_lt_i32 s4, 31
+; SI-NEXT:    s_add_i32 s3, s3, s4
+; SI-NEXT:    s_cmp_lt_i32 s8, 31
 ; SI-NEXT:    s_cselect_b32 s3, s3, 0x7c00
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_movk_i32 s0, 0x7e00
 ; SI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
 ; SI-NEXT:    s_cselect_b32 s0, s0, s3
 ; SI-NEXT:    s_lshr_b32 s1, s1, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
@@ -1104,35 +1111,37 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffe
 ; VI-NEXT:    v_readfirstlane_b32 s3, v0
 ; VI-NEXT:    s_sub_i32 s4, 0x3f1, s1
-; VI-NEXT:    s_or_b32 s0, s0, s3
+; VI-NEXT:    s_or_b32 s3, s0, s3
 ; VI-NEXT:    v_med3_i32 v0, s4, 0, 13
-; VI-NEXT:    s_or_b32 s3, s0, 0x1000
+; VI-NEXT:    s_or_b32 s0, s3, 0x1000
 ; VI-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-NEXT:    s_lshr_b32 s5, s3, s4
+; VI-NEXT:    s_lshr_b32 s5, s0, s4
+; VI-NEXT:    s_or_b32 s6, s5, 1
 ; VI-NEXT:    s_lshl_b32 s4, s5, s4
-; VI-NEXT:    s_cmp_lg_u32 s4, s3
-; VI-NEXT:    s_cselect_b32 s3, 1, 0
-; VI-NEXT:    s_addk_i32 s1, 0xfc10
-; VI-NEXT:    s_lshl_b32 s4, s1, 12
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_or_b32 s4, s0, s4
-; VI-NEXT:    s_cmp_lt_i32 s1, 1
-; VI-NEXT:    s_cselect_b32 s3, s3, s4
-; VI-NEXT:    s_and_b32 s4, s3, 7
-; VI-NEXT:    s_cmp_gt_i32 s4, 5
-; VI-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, s0
+; VI-NEXT:    s_cselect_b32 s0, s6, s5
+; VI-NEXT:    s_add_i32 s6, s1, 0xfffffc10
+; VI-NEXT:    s_lshl_b32 s1, s6, 12
+; VI-NEXT:    s_or_b32 s1, s3, s1
+; VI-NEXT:    s_cmp_lt_i32 s6, 1
+; VI-NEXT:    s_cselect_b32 s7, s0, s1
+; VI-NEXT:    s_and_b32 s4, s7, 7
 ; VI-NEXT:    s_cmp_eq_u32 s4, 3
-; VI-NEXT:    s_cselect_b32 s4, 1, 0
-; VI-NEXT:    s_or_b32 s4, s4, s5
-; VI-NEXT:    s_lshr_b32 s3, s3, 2
-; VI-NEXT:    s_add_i32 s3, s3, s4
-; VI-NEXT:    s_cmp_lt_i32 s1, 31
-; VI-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_movk_i32 s0, 0x7e00
-; VI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
-; VI-NEXT:    s_cselect_b32 s0, s0, s3
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_cmp_gt_i32 s4, 5
+; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    s_lshr_b32 s1, s7, 2
+; VI-NEXT:    s_add_i32 s1, s1, s0
+; VI-NEXT:    s_cmp_lt_i32 s6, 31
+; VI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s3, 0
+; VI-NEXT:    s_movk_i32 s1, 0x7e00
+; VI-NEXT:    s_cselect_b32 s1, s1, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s6, 0x40f
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
 ; VI-NEXT:    s_movk_i32 s1, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
@@ -1152,35 +1161,37 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffe
 ; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0x3f1, s1
-; GFX9-NEXT:    s_or_b32 s0, s0, s3
+; GFX9-NEXT:    s_or_b32 s3, s0, s3
 ; GFX9-NEXT:    v_med3_i32 v0, s4, 0, 13
-; GFX9-NEXT:    s_or_b32 s3, s0, 0x1000
+; GFX9-NEXT:    s_or_b32 s0, s3, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    s_lshr_b32 s5, s3, s4
+; GFX9-NEXT:    s_lshr_b32 s5, s0, s4
+; GFX9-NEXT:    s_or_b32 s6, s5, 1
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s4, s3
-; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX9-NEXT:    s_addk_i32 s1, 0xfc10
-; GFX9-NEXT:    s_lshl_b32 s4, s1, 12
-; GFX9-NEXT:    s_or_b32 s3, s5, s3
-; GFX9-NEXT:    s_or_b32 s4, s0, s4
-; GFX9-NEXT:    s_cmp_lt_i32 s1, 1
-; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX9-NEXT:    s_and_b32 s4, s3, 7
-; GFX9-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, s0
+; GFX9-NEXT:    s_cselect_b32 s0, s6, s5
+; GFX9-NEXT:    s_add_i32 s6, s1, 0xfffffc10
+; GFX9-NEXT:    s_lshl_b32 s1, s6, 12
+; GFX9-NEXT:    s_or_b32 s1, s3, s1
+; GFX9-NEXT:    s_cmp_lt_i32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s7, s0, s1
+; GFX9-NEXT:    s_and_b32 s4, s7, 7
 ; GFX9-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX9-NEXT:    s_or_b32 s4, s4, s5
-; GFX9-NEXT:    s_lshr_b32 s3, s3, 2
-; GFX9-NEXT:    s_add_i32 s3, s3, s4
-; GFX9-NEXT:    s_cmp_lt_i32 s1, 31
-; GFX9-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX9-NEXT:    s_cselect_b32 s0, s0, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s1, 0x40f
-; GFX9-NEXT:    s_cselect_b32 s0, s0, s3
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s4, 5
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_lshr_b32 s1, s7, 2
+; GFX9-NEXT:    s_add_i32 s1, s1, s0
+; GFX9-NEXT:    s_cmp_lt_i32 s6, 31
+; GFX9-NEXT:    s_cselect_b32 s0, s1, 0x7c00
+; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
+; GFX9-NEXT:    s_movk_i32 s1, 0x7e00
+; GFX9-NEXT:    s_cselect_b32 s1, s1, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x40f
+; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
 ; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
@@ -1188,59 +1199,120 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_b32 s3, s1, 0x1ff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s3, s0
-; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    s_bfe_u32 s0, s1, 0xb0014
-; GFX11-NEXT:    s_lshr_b32 s1, s1, 8
-; GFX11-NEXT:    s_sub_i32 s3, 0x3f1, s0
-; GFX11-NEXT:    s_and_b32 s1, s1, 0xffe
-; GFX11-NEXT:    v_med3_i32 v1, s3, 0, 13
-; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-NEXT:    s_or_b32 s1, s1, s3
-; GFX11-NEXT:    s_or_b32 s3, s1, 0x1000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s5, s3, s4
-; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_cmp_lg_u32 s4, s3
-; GFX11-NEXT:    s_cselect_b32 s3, 1, 0
-; GFX11-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-NEXT:    s_or_b32 s3, s5, s3
-; GFX11-NEXT:    s_lshl_b32 s4, s0, 12
-; GFX11-NEXT:    s_or_b32 s4, s1, s4
-; GFX11-NEXT:    s_cmp_lt_i32 s0, 1
-; GFX11-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s4, s3, 7
-; GFX11-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 2
-; GFX11-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s3, s3, s4
-; GFX11-NEXT:    s_cmp_lt_i32 s0, 31
-; GFX11-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX11-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, s4, 0x7c00
-; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x40f
-; GFX11-NEXT:    s_cselect_b32 s0, s1, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_and_b32 s3, s1, 0x1ff
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s3, s0
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX11-TRUE16-NEXT:    s_sub_i32 s3, 0x3f1, s0
+; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffe
+; GFX11-TRUE16-NEXT:    v_med3_i32 v1, s3, 0, 13
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s3
+; GFX11-TRUE16-NEXT:    s_or_b32 s3, s1, 0x1000
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s3, s4
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-TRUE16-NEXT:    s_or_b32 s6, s5, 1
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s4, s3
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0xfc10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s0, 12
+; GFX11-TRUE16-NEXT:    s_or_b32 s4, s1, s4
+; GFX11-TRUE16-NEXT:    s_cmp_lt_i32 s0, 1
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, s3, 7
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s4, 3
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX11-TRUE16-NEXT:    s_cmp_gt_i32 s4, 5
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_add_i32 s3, s3, s4
+; GFX11-TRUE16-NEXT:    s_cmp_lt_i32 s0, 31
+; GFX11-TRUE16-NEXT:    s_movk_i32 s4, 0x7e00
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, 0x7c00
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, s4, 0x7c00
+; GFX11-TRUE16-NEXT:    s_cmpk_eq_i32 s0, 0x40f
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s1, s3
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_and_b32 s3, s1, 0x1ff
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s3, s0
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX11-FAKE16-NEXT:    s_sub_i32 s3, 0x3f1, s0
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffe
+; GFX11-FAKE16-NEXT:    v_med3_i32 v1, s3, 0, 13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s3
+; GFX11-FAKE16-NEXT:    s_or_b32 s3, s1, 0x1000
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s3, s4
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-FAKE16-NEXT:    s_or_b32 s6, s5, 1
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s4, s3
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0xfc10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s0, 12
+; GFX11-FAKE16-NEXT:    s_or_b32 s4, s1, s4
+; GFX11-FAKE16-NEXT:    s_cmp_lt_i32 s0, 1
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s3, 7
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s4, 3
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX11-FAKE16-NEXT:    s_cmp_gt_i32 s4, 5
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_add_i32 s3, s3, s4
+; GFX11-FAKE16-NEXT:    s_cmp_lt_i32 s0, 31
+; GFX11-FAKE16-NEXT:    s_movk_i32 s4, 0x7e00
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, 0x7c00
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s4, 0x7c00
+; GFX11-FAKE16-NEXT:    s_cmpk_eq_i32 s0, 0x40f
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s1, s3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %mag.trunc = fptrunc double %mag to half
   %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
   %cast = bitcast half %result to i16
@@ -3029,28 +3101,27 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffe, v6
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v7, v3, 20, 11
-; SI-NEXT:    s_movk_i32 s4, 0x3f1
+; SI-NEXT:    s_movk_i32 s6, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_sub_i32_e32 v8, vcc, s4, v7
+; SI-NEXT:    v_sub_i32_e32 v8, vcc, s6, v7
 ; SI-NEXT:    v_or_b32_e32 v6, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, v8, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, v8, v9
+; SI-NEXT:    v_or_b32_e32 v10, 1, v9
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v6
-; SI-NEXT:    s_movk_i32 s5, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, s5, v7
+; SI-NEXT:    s_movk_i32 s7, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 12, v7
-; SI-NEXT:    v_or_b32_e32 v6, v9, v6
 ; SI-NEXT:    v_or_b32_e32 v8, v2, v8
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; SI-NEXT:    v_and_b32_e32 v8, 7, v6
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
-; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v8, v8, v9
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x7c00
@@ -3058,9 +3129,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v9, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_movk_i32 s6, 0x40f
+; SI-NEXT:    s_movk_i32 s8, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v6, 0x1ff, v1
@@ -3073,25 +3144,24 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v6, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v7, vcc, s4, v6
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, s6, v6
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, v7, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
+; SI-NEXT:    v_or_b32_e32 v11, 1, v10
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v3
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v6, vcc, s5, v6
+; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v11, vcc
+; SI-NEXT:    v_add_i32_e32 v6, vcc, s7, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
-; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v3
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v7, v7, v10
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
@@ -3100,7 +3170,7 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -3124,28 +3194,27 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
-; VI-NEXT:    s_movk_i32 s4, 0x3f1
+; VI-NEXT:    s_movk_i32 s6, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v3
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, s6, v3
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
+; VI-NEXT:    v_or_b32_e32 v8, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v5
-; VI-NEXT:    s_movk_i32 s5, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
+; VI-NEXT:    s_movk_i32 s7, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v3
-; VI-NEXT:    v_or_b32_e32 v5, v7, v5
 ; VI-NEXT:    v_or_b32_e32 v6, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v5
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v6, v6, v7
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7c00
@@ -3153,9 +3222,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    s_movk_i32 s6, 0x40f
+; VI-NEXT:    s_movk_i32 s8, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
@@ -3165,32 +3234,31 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v1
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s6, v1
 ; VI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
+; VI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
-; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v1
-; VI-NEXT:    v_or_b32_e32 v3, v8, v3
 ; VI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 7, v3
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v5, v5, v8
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -3202,32 +3270,31 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX9-NEXT:    s_movk_i32 s5, 0xffe
+; GFX9-NEXT:    s_movk_i32 s7, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s5, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s7, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 0x3f1, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
+; GFX9-NEXT:    v_or_b32_e32 v9, 1, v8
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 0xfffffc10, v6
-; GFX9-NEXT:    v_or_b32_e32 v5, v8, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v6, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v7, 7, v5
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -3235,47 +3302,46 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_movk_i32 s6, 0x40f
+; GFX9-NEXT:    s_movk_i32 s8, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    s_mov_b32 s7, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
+; GFX9-NEXT:    s_mov_b32 s9, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v6, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, v6, v9
+; GFX9-NEXT:    v_or_b32_e32 v10, 1, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_or_b32_e32 v2, v9, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v5, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
@@ -3289,12 +3355,11 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 20, 11
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 20, 11
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 20, 11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v5, v2
@@ -3311,61 +3376,59 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v5, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 1, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 1, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v6, 12, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v13 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v7, 12, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v10
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v11, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v6, 12, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0xfffffc10, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v7, 12, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v14, v8, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v7
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v10, v5 :: v_dual_mov_b32 v10, 0x7e00
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 7, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 7, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v11
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, 0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v8, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v9
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 7, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s2, s1
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v8, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v10 :: v_dual_add_nc_u32 v5, v5, v11
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v11 :: v_dual_add_nc_u32 v5, v5, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v7
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v12, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v9, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v12, v0
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v9, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v4
@@ -3378,17 +3441,15 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v3, 20, 11
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, 0x7e00
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v3, 20, 11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v5, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, 0x3f1, v7
@@ -3401,62 +3462,59 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v8, v9
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, v5, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 1, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 1, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v6, 12, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v7, 12, v2
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v6, 12, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0xfffffc10, v7
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v12, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v7, 12, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v13, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v7
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 7, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 7, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v9
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, 0x7c00, v8 :: v_dual_add_nc_u32 v5, v5, v10
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v4
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc <2 x double> %mag to <2 x half>
@@ -3853,78 +3911,82 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; SI-NEXT:    s_or_b32 s4, s0, 0x1000
 ; SI-NEXT:    v_readfirstlane_b32 s6, v2
 ; SI-NEXT:    s_lshr_b32 s7, s4, s6
+; SI-NEXT:    s_or_b32 s8, s7, 1
 ; SI-NEXT:    s_lshl_b32 s6, s7, s6
 ; SI-NEXT:    s_cmp_lg_u32 s6, s4
-; SI-NEXT:    s_cselect_b32 s4, 1, 0
-; SI-NEXT:    s_addk_i32 s5, 0xfc10
-; SI-NEXT:    s_lshl_b32 s6, s5, 12
-; SI-NEXT:    s_or_b32 s4, s7, s4
-; SI-NEXT:    s_or_b32 s6, s0, s6
-; SI-NEXT:    s_cmp_lt_i32 s5, 1
-; SI-NEXT:    s_cselect_b32 s4, s4, s6
-; SI-NEXT:    s_and_b32 s6, s4, 7
-; SI-NEXT:    s_cmp_gt_i32 s6, 5
-; SI-NEXT:    s_cselect_b32 s7, 1, 0
+; SI-NEXT:    s_cselect_b32 s4, s8, s7
+; SI-NEXT:    s_add_i32 s8, s5, 0xfffffc10
+; SI-NEXT:    s_lshl_b32 s5, s8, 12
+; SI-NEXT:    s_or_b32 s5, s0, s5
+; SI-NEXT:    s_cmp_lt_i32 s8, 1
+; SI-NEXT:    s_cselect_b32 s9, s4, s5
+; SI-NEXT:    s_and_b32 s6, s9, 7
 ; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b32 s6, 1, 0
-; SI-NEXT:    s_or_b32 s6, s6, s7
-; SI-NEXT:    s_lshr_b32 s4, s4, 2
-; SI-NEXT:    s_add_i32 s4, s4, s6
-; SI-NEXT:    s_cmp_lt_i32 s5, 31
-; SI-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s6, 5
+; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; SI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_lshr_b32 s5, s9, 2
+; SI-NEXT:    s_add_i32 s5, s5, s4
+; SI-NEXT:    s_cmp_lt_i32 s8, 31
+; SI-NEXT:    s_cselect_b32 s4, s5, 0x7c00
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_movk_i32 s6, 0x7e00
 ; SI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s5, 0x40f
+; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
 ; SI-NEXT:    s_cselect_b32 s0, s0, s4
 ; SI-NEXT:    s_lshr_b32 s1, s1, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
-; SI-NEXT:    s_or_b32 s4, s1, s0
+; SI-NEXT:    s_or_b32 s7, s1, s0
 ; SI-NEXT:    s_lshr_b32 s0, s3, 8
-; SI-NEXT:    s_and_b32 s5, s0, 0xffe
+; SI-NEXT:    s_and_b32 s4, s0, 0xffe
 ; SI-NEXT:    s_and_b32 s0, s3, 0x1ff
 ; SI-NEXT:    s_or_b32 s0, s0, s2
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SI-NEXT:    v_readfirstlane_b32 s0, v2
-; SI-NEXT:    s_bfe_u32 s2, s3, 0xb0014
-; SI-NEXT:    s_or_b32 s0, s5, s0
-; SI-NEXT:    s_sub_i32 s5, 0x3f1, s2
-; SI-NEXT:    v_med3_i32 v2, s5, 0, 13
-; SI-NEXT:    s_or_b32 s1, s0, 0x1000
-; SI-NEXT:    v_readfirstlane_b32 s5, v2
-; SI-NEXT:    s_lshr_b32 s7, s1, s5
-; SI-NEXT:    s_lshl_b32 s5, s7, s5
-; SI-NEXT:    s_cmp_lg_u32 s5, s1
-; SI-NEXT:    s_cselect_b32 s1, 1, 0
-; SI-NEXT:    s_addk_i32 s2, 0xfc10
-; SI-NEXT:    s_lshl_b32 s5, s2, 12
-; SI-NEXT:    s_or_b32 s1, s7, s1
-; SI-NEXT:    s_or_b32 s5, s0, s5
-; SI-NEXT:    s_cmp_lt_i32 s2, 1
-; SI-NEXT:    s_cselect_b32 s1, s1, s5
-; SI-NEXT:    s_and_b32 s5, s1, 7
-; SI-NEXT:    s_cmp_gt_i32 s5, 5
-; SI-NEXT:    s_cselect_b32 s7, 1, 0
-; SI-NEXT:    s_cmp_eq_u32 s5, 3
-; SI-NEXT:    s_cselect_b32 s5, 1, 0
-; SI-NEXT:    s_or_b32 s5, s5, s7
-; SI-NEXT:    s_lshr_b32 s1, s1, 2
-; SI-NEXT:    s_add_i32 s1, s1, s5
-; SI-NEXT:    s_cmp_lt_i32 s2, 31
-; SI-NEXT:    s_cselect_b32 s1, s1, 0x7c00
-; SI-NEXT:    s_cmp_lg_u32 s0, 0
-; SI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s2, 0x40f
-; SI-NEXT:    s_cselect_b32 s0, s0, s1
+; SI-NEXT:    s_bfe_u32 s1, s3, 0xb0014
+; SI-NEXT:    s_or_b32 s2, s4, s0
+; SI-NEXT:    s_sub_i32 s4, 0x3f1, s1
+; SI-NEXT:    v_med3_i32 v2, s4, 0, 13
+; SI-NEXT:    s_or_b32 s0, s2, 0x1000
+; SI-NEXT:    v_readfirstlane_b32 s4, v2
+; SI-NEXT:    s_lshr_b32 s5, s0, s4
+; SI-NEXT:    s_or_b32 s8, s5, 1
+; SI-NEXT:    s_lshl_b32 s4, s5, s4
+; SI-NEXT:    s_cmp_lg_u32 s4, s0
+; SI-NEXT:    s_cselect_b32 s0, s8, s5
+; SI-NEXT:    s_add_i32 s8, s1, 0xfffffc10
+; SI-NEXT:    s_lshl_b32 s1, s8, 12
+; SI-NEXT:    s_or_b32 s1, s2, s1
+; SI-NEXT:    s_cmp_lt_i32 s8, 1
+; SI-NEXT:    s_cselect_b32 s9, s0, s1
+; SI-NEXT:    s_and_b32 s4, s9, 7
+; SI-NEXT:    s_cmp_eq_u32 s4, 3
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s4, 5
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_lshr_b32 s1, s9, 2
+; SI-NEXT:    s_add_i32 s1, s1, s0
+; SI-NEXT:    s_cmp_lt_i32 s8, 31
+; SI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
+; SI-NEXT:    s_cmp_lg_u32 s2, 0
+; SI-NEXT:    s_cselect_b32 s1, s6, 0x7c00
+; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
+; SI-NEXT:    s_cselect_b32 s0, s1, s0
 ; SI-NEXT:    s_lshr_b32 s1, s3, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
 ; SI-NEXT:    s_or_b32 s0, s1, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s7
 ; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_bfi_b32 v0, s0, v2, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -3947,36 +4009,38 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    s_bfe_u32 s3, s3, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s2, v0
 ; VI-NEXT:    s_sub_i32 s6, 0x3f1, s3
-; VI-NEXT:    s_or_b32 s2, s5, s2
+; VI-NEXT:    s_or_b32 s5, s5, s2
 ; VI-NEXT:    v_med3_i32 v0, s6, 0, 13
-; VI-NEXT:    s_or_b32 s5, s2, 0x1000
+; VI-NEXT:    s_or_b32 s2, s5, 0x1000
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
-; VI-NEXT:    s_lshr_b32 s7, s5, s6
+; VI-NEXT:    s_lshr_b32 s7, s2, s6
+; VI-NEXT:    s_or_b32 s8, s7, 1
 ; VI-NEXT:    s_lshl_b32 s6, s7, s6
-; VI-NEXT:    s_cmp_lg_u32 s6, s5
-; VI-NEXT:    s_cselect_b32 s5, 1, 0
-; VI-NEXT:    s_addk_i32 s3, 0xfc10
-; VI-NEXT:    s_lshl_b32 s6, s3, 12
-; VI-NEXT:    s_or_b32 s5, s7, s5
-; VI-NEXT:    s_or_b32 s6, s2, s6
-; VI-NEXT:    s_cmp_lt_i32 s3, 1
-; VI-NEXT:    s_cselect_b32 s5, s5, s6
-; VI-NEXT:    s_and_b32 s6, s5, 7
-; VI-NEXT:    s_cmp_gt_i32 s6, 5
-; VI-NEXT:    s_cselect_b32 s7, 1, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, s2
+; VI-NEXT:    s_cselect_b32 s2, s8, s7
+; VI-NEXT:    s_add_i32 s8, s3, 0xfffffc10
+; VI-NEXT:    s_lshl_b32 s3, s8, 12
+; VI-NEXT:    s_or_b32 s3, s5, s3
+; VI-NEXT:    s_cmp_lt_i32 s8, 1
+; VI-NEXT:    s_cselect_b32 s9, s2, s3
+; VI-NEXT:    s_and_b32 s6, s9, 7
 ; VI-NEXT:    s_cmp_eq_u32 s6, 3
-; VI-NEXT:    s_cselect_b32 s6, 1, 0
-; VI-NEXT:    s_or_b32 s6, s6, s7
-; VI-NEXT:    s_lshr_b32 s5, s5, 2
-; VI-NEXT:    s_add_i32 s5, s5, s6
-; VI-NEXT:    s_cmp_lt_i32 s3, 31
-; VI-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s2, 0
-; VI-NEXT:    s_movk_i32 s6, 0x7e00
-; VI-NEXT:    s_cselect_b32 s2, s6, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s3, 0x40f
-; VI-NEXT:    s_cselect_b32 s2, s2, s5
-; VI-NEXT:    s_lshl_b32 s5, s2, 16
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    s_cmp_gt_i32 s6, 5
+; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; VI-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_lshr_b32 s3, s9, 2
+; VI-NEXT:    s_add_i32 s3, s3, s2
+; VI-NEXT:    s_cmp_lt_i32 s8, 31
+; VI-NEXT:    s_cselect_b32 s2, s3, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s5, 0
+; VI-NEXT:    s_movk_i32 s5, 0x7e00
+; VI-NEXT:    s_cselect_b32 s3, s5, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
+; VI-NEXT:    s_cselect_b32 s2, s3, s2
+; VI-NEXT:    s_lshl_b32 s6, s2, 16
 ; VI-NEXT:    s_lshr_b32 s2, s1, 8
 ; VI-NEXT:    s_and_b32 s7, s2, 0xffe
 ; VI-NEXT:    s_and_b32 s2, s1, 0x1ff
@@ -3986,37 +4050,39 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; VI-NEXT:    s_bfe_u32 s1, s1, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s0, v0
-; VI-NEXT:    s_sub_i32 s3, 0x3f1, s1
-; VI-NEXT:    s_or_b32 s0, s7, s0
-; VI-NEXT:    v_med3_i32 v0, s3, 0, 13
-; VI-NEXT:    s_or_b32 s2, s0, 0x1000
-; VI-NEXT:    v_readfirstlane_b32 s3, v0
-; VI-NEXT:    s_lshr_b32 s7, s2, s3
-; VI-NEXT:    s_lshl_b32 s3, s7, s3
-; VI-NEXT:    s_cmp_lg_u32 s3, s2
-; VI-NEXT:    s_cselect_b32 s2, 1, 0
-; VI-NEXT:    s_addk_i32 s1, 0xfc10
-; VI-NEXT:    s_lshl_b32 s3, s1, 12
-; VI-NEXT:    s_or_b32 s2, s7, s2
-; VI-NEXT:    s_or_b32 s3, s0, s3
-; VI-NEXT:    s_cmp_lt_i32 s1, 1
-; VI-NEXT:    s_cselect_b32 s2, s2, s3
-; VI-NEXT:    s_and_b32 s3, s2, 7
-; VI-NEXT:    s_cmp_gt_i32 s3, 5
-; VI-NEXT:    s_cselect_b32 s7, 1, 0
-; VI-NEXT:    s_cmp_eq_u32 s3, 3
-; VI-NEXT:    s_cselect_b32 s3, 1, 0
-; VI-NEXT:    s_or_b32 s3, s3, s7
-; VI-NEXT:    s_lshr_b32 s2, s2, 2
-; VI-NEXT:    s_add_i32 s2, s2, s3
-; VI-NEXT:    s_cmp_lt_i32 s1, 31
-; VI-NEXT:    s_cselect_b32 s2, s2, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s0, 0
-; VI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
-; VI-NEXT:    s_cselect_b32 s0, s0, s2
+; VI-NEXT:    s_sub_i32 s2, 0x3f1, s1
+; VI-NEXT:    s_or_b32 s7, s7, s0
+; VI-NEXT:    v_med3_i32 v0, s2, 0, 13
+; VI-NEXT:    s_or_b32 s0, s7, 0x1000
+; VI-NEXT:    v_readfirstlane_b32 s2, v0
+; VI-NEXT:    s_lshr_b32 s3, s0, s2
+; VI-NEXT:    s_or_b32 s8, s3, 1
+; VI-NEXT:    s_lshl_b32 s2, s3, s2
+; VI-NEXT:    s_cmp_lg_u32 s2, s0
+; VI-NEXT:    s_cselect_b32 s0, s8, s3
+; VI-NEXT:    s_add_i32 s8, s1, 0xfffffc10
+; VI-NEXT:    s_lshl_b32 s1, s8, 12
+; VI-NEXT:    s_or_b32 s1, s7, s1
+; VI-NEXT:    s_cmp_lt_i32 s8, 1
+; VI-NEXT:    s_cselect_b32 s9, s0, s1
+; VI-NEXT:    s_and_b32 s2, s9, 7
+; VI-NEXT:    s_cmp_eq_u32 s2, 3
+; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_cmp_gt_i32 s2, 5
+; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; VI-NEXT:    s_cselect_b32 s0, 1, 0
+; VI-NEXT:    s_lshr_b32 s1, s9, 2
+; VI-NEXT:    s_add_i32 s1, s1, s0
+; VI-NEXT:    s_cmp_lt_i32 s8, 31
+; VI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s7, 0
+; VI-NEXT:    s_cselect_b32 s1, s5, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
+; VI-NEXT:    s_cselect_b32 s0, s1, s0
 ; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
-; VI-NEXT:    s_or_b32 s0, s0, s5
+; VI-NEXT:    s_or_b32 s0, s0, s6
 ; VI-NEXT:    s_mov_b32 s1, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4041,29 +4107,31 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-NEXT:    s_or_b32 s5, s2, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_lshr_b32 s8, s5, s7
+; GFX9-NEXT:    s_or_b32 s9, s8, 1
 ; GFX9-NEXT:    s_lshl_b32 s7, s8, s7
 ; GFX9-NEXT:    s_cmp_lg_u32 s7, s5
-; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX9-NEXT:    s_addk_i32 s6, 0xfc10
-; GFX9-NEXT:    s_lshl_b32 s7, s6, 12
-; GFX9-NEXT:    s_or_b32 s5, s8, s5
-; GFX9-NEXT:    s_or_b32 s7, s2, s7
-; GFX9-NEXT:    s_cmp_lt_i32 s6, 1
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s7
-; GFX9-NEXT:    s_and_b32 s7, s5, 7
-; GFX9-NEXT:    s_cmp_gt_i32 s7, 5
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX9-NEXT:    s_or_b32 s7, s7, s8
+; GFX9-NEXT:    s_cselect_b32 s5, s9, s8
+; GFX9-NEXT:    s_add_i32 s10, s6, 0xfffffc10
+; GFX9-NEXT:    s_lshl_b32 s6, s10, 12
+; GFX9-NEXT:    s_or_b32 s6, s2, s6
+; GFX9-NEXT:    s_cmp_lt_i32 s10, 1
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s6
+; GFX9-NEXT:    s_and_b32 s8, s5, 7
+; GFX9-NEXT:    s_cmp_eq_u32 s8, 3
+; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s8, 5
+; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX9-NEXT:    s_lshr_b32 s5, s5, 2
-; GFX9-NEXT:    s_add_i32 s5, s5, s7
-; GFX9-NEXT:    s_cmp_lt_i32 s6, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s6
+; GFX9-NEXT:    s_cmp_lt_i32 s10, 31
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, 0x7c00
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_movk_i32 s7, 0x7e00
-; GFX9-NEXT:    s_cselect_b32 s2, s7, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x40f
+; GFX9-NEXT:    s_movk_i32 s8, 0x7e00
+; GFX9-NEXT:    s_cselect_b32 s2, s8, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s10, 0x40f
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s5
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, 0x8000
@@ -4082,29 +4150,31 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-NEXT:    v_med3_i32 v0, s6, 0, 13
 ; GFX9-NEXT:    s_or_b32 s2, s0, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_lshr_b32 s8, s2, s6
-; GFX9-NEXT:    s_lshl_b32 s6, s8, s6
+; GFX9-NEXT:    s_lshr_b32 s7, s2, s6
+; GFX9-NEXT:    s_or_b32 s9, s7, 1
+; GFX9-NEXT:    s_lshl_b32 s6, s7, s6
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, s2
-; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_addk_i32 s3, 0xfc10
-; GFX9-NEXT:    s_lshl_b32 s6, s3, 12
-; GFX9-NEXT:    s_or_b32 s2, s8, s2
-; GFX9-NEXT:    s_or_b32 s6, s0, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s3, 1
-; GFX9-NEXT:    s_cselect_b32 s2, s2, s6
-; GFX9-NEXT:    s_and_b32 s6, s2, 7
-; GFX9-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s9, s7
+; GFX9-NEXT:    s_add_i32 s9, s3, 0xfffffc10
+; GFX9-NEXT:    s_lshl_b32 s3, s9, 12
+; GFX9-NEXT:    s_or_b32 s3, s0, s3
+; GFX9-NEXT:    s_cmp_lt_i32 s9, 1
+; GFX9-NEXT:    s_cselect_b32 s10, s2, s3
+; GFX9-NEXT:    s_and_b32 s6, s10, 7
 ; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX9-NEXT:    s_or_b32 s6, s6, s8
-; GFX9-NEXT:    s_lshr_b32 s2, s2, 2
-; GFX9-NEXT:    s_add_i32 s2, s2, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s3, 31
-; GFX9-NEXT:    s_cselect_b32 s2, s2, 0x7c00
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX9-NEXT:    s_lshr_b32 s3, s10, 2
+; GFX9-NEXT:    s_add_i32 s3, s3, s2
+; GFX9-NEXT:    s_cmp_lt_i32 s9, 31
+; GFX9-NEXT:    s_cselect_b32 s2, s3, 0x7c00
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_cselect_b32 s0, s7, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; GFX9-NEXT:    s_cselect_b32 s0, s8, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s9, 0x40f
 ; GFX9-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0x8000
@@ -4139,23 +4209,26 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s8, s6, s7
 ; GFX11-NEXT:    s_lshl_b32 s7, s8, s7
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s9, s8, 1
 ; GFX11-NEXT:    s_cmp_lg_u32 s7, s6
-; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s6, s9, s8
 ; GFX11-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-NEXT:    s_or_b32 s6, s8, s6
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s7, s2, 12
 ; GFX11-NEXT:    s_or_b32 s7, s5, s7
 ; GFX11-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX11-NEXT:    s_cselect_b32 s6, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s7, s6, 7
-; GFX11-NEXT:    s_cmp_gt_i32 s7, 5
-; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-NEXT:    s_cmp_gt_i32 s7, 5
+; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
+; GFX11-NEXT:    s_and_b32 s7, s7, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX11-NEXT:    s_lshr_b32 s6, s6, 2
-; GFX11-NEXT:    s_or_b32 s7, s7, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s6, s6, s7
 ; GFX11-NEXT:    s_cmp_lt_i32 s2, 31
@@ -4189,23 +4262,26 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s8, s5, s6
 ; GFX11-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s9, s8, 1
 ; GFX11-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_cselect_b32 s5, s9, s8
 ; GFX11-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-NEXT:    s_or_b32 s5, s8, s5
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s6, s0, 12
 ; GFX11-NEXT:    s_or_b32 s6, s3, s6
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s6, s5, 7
-; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s6, s6, s8
+; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-NEXT:    s_lshr_b32 s5, s5, 2
-; GFX11-NEXT:    s_or_b32 s6, s6, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s5, s5, s6
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 31
@@ -4342,15 +4418,27 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s2, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s2, s4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT:    ; return to shader part epilog
   %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
   %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
   %cast = bitcast <2 x half> %out to i32
@@ -4665,28 +4753,27 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffe, v9
 ; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v10, v5, 20, 11
-; SI-NEXT:    s_movk_i32 s4, 0x3f1
+; SI-NEXT:    s_movk_i32 s6, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
-; SI-NEXT:    v_sub_i32_e32 v11, vcc, s4, v10
+; SI-NEXT:    v_sub_i32_e32 v11, vcc, s6, v10
 ; SI-NEXT:    v_or_b32_e32 v9, 0x1000, v4
 ; SI-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, v11, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
+; SI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v9
-; SI-NEXT:    s_movk_i32 s5, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v10, vcc, s5, v10
+; SI-NEXT:    s_movk_i32 s7, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e32 v9, v12, v13, vcc
+; SI-NEXT:    v_add_i32_e32 v10, vcc, s7, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 12, v10
-; SI-NEXT:    v_or_b32_e32 v9, v12, v9
 ; SI-NEXT:    v_or_b32_e32 v11, v4, v11
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; SI-NEXT:    v_and_b32_e32 v11, 7, v9
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; SI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v11, v11, v12
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; SI-NEXT:    v_mov_b32_e32 v11, 0x7c00
@@ -4694,9 +4781,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT:    s_movk_i32 s6, 0x40f
+; SI-NEXT:    s_movk_i32 s8, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v10
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_and_b32_e32 v9, 0x1ff, v3
@@ -4709,32 +4796,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v9, v3, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v2, v5, v2
-; SI-NEXT:    v_sub_i32_e32 v10, vcc, s4, v9
+; SI-NEXT:    v_sub_i32_e32 v10, vcc, s6, v9
 ; SI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, v10, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
+; SI-NEXT:    v_or_b32_e32 v14, 1, v13
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v5
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v9, vcc, s5, v9
+; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
+; SI-NEXT:    v_add_i32_e32 v9, vcc, s7, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 12, v9
-; SI-NEXT:    v_or_b32_e32 v5, v13, v5
 ; SI-NEXT:    v_or_b32_e32 v10, v2, v10
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v10, 7, v5
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
-; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v10, v10, v13
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v9
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
@@ -4747,25 +4833,24 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v5, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v9, vcc, s4, v5
+; SI-NEXT:    v_sub_i32_e32 v9, vcc, s6, v5
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, v9, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
+; SI-NEXT:    v_or_b32_e32 v13, 1, v10
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v3
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
+; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v13, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 12, v5
-; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v9, v0, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v9, 7, v3
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
-; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v9, v9, v10
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
@@ -4773,7 +4858,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -4798,28 +4883,27 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v8, 0xffe, v8
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; VI-NEXT:    s_movk_i32 s4, 0x3f1
+; VI-NEXT:    s_movk_i32 s6, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v8, v4
-; VI-NEXT:    v_sub_u32_e32 v9, vcc, s4, v5
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, s6, v5
 ; VI-NEXT:    v_or_b32_e32 v8, 0x1000, v4
 ; VI-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, v9, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
+; VI-NEXT:    v_or_b32_e32 v11, 1, v10
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v8
-; VI-NEXT:    s_movk_i32 s5, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
+; VI-NEXT:    s_movk_i32 s7, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s7, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 12, v5
-; VI-NEXT:    v_or_b32_e32 v8, v10, v8
 ; VI-NEXT:    v_or_b32_e32 v9, v4, v9
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; VI-NEXT:    v_and_b32_e32 v9, 7, v8
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v9, v9, v10
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; VI-NEXT:    v_mov_b32_e32 v9, 0x7c00
@@ -4827,9 +4911,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; VI-NEXT:    v_mov_b32_e32 v10, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT:    s_movk_i32 s6, 0x40f
+; VI-NEXT:    s_movk_i32 s8, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v8, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v8, v0
@@ -4839,32 +4923,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
-; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v1
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, s6, v1
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v11, v8, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
+; VI-NEXT:    v_or_b32_e32 v12, 1, v11
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v1
-; VI-NEXT:    v_or_b32_e32 v5, v11, v5
 ; VI-NEXT:    v_or_b32_e32 v8, v0, v8
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v8, 7, v5
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v8, v8, v11
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
@@ -4874,32 +4957,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v3
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s6, v3
 ; VI-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
+; VI-NEXT:    v_or_b32_e32 v11, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v8, v11, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v3
-; VI-NEXT:    v_or_b32_e32 v2, v8, v2
 ; VI-NEXT:    v_or_b32_e32 v5, v1, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 7, v2
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v5, v5, v8
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -4912,32 +4994,31 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s4, v4
+; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX9-NEXT:    s_movk_i32 s5, 0xffe
+; GFX9-NEXT:    s_movk_i32 s7, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v4, v8, s5, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v8, s7, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v9, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v8, 0x1000, v4
 ; GFX9-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, v9, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_or_b32_e32 v8, v10, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v9, v5, 12, v4
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v9, 7, v8
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7c00
@@ -4945,79 +5026,77 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    s_movk_i32 s6, 0x40f
+; GFX9-NEXT:    s_movk_i32 s8, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s5, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s7, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 0x3f1, v8
 ; GFX9-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, v11, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
+; GFX9-NEXT:    v_or_b32_e32 v13, 1, v12
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v8, 0xfffffc10, v8
-; GFX9-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v8, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v11, 7, v5
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    s_mov_b32 s7, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
+; GFX9-NEXT:    s_mov_b32 s9, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, v8, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
+; GFX9-NEXT:    v_or_b32_e32 v12, 1, v11
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_or_b32_e32 v2, v11, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v5, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v8, v8, v11
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
@@ -5031,108 +5110,106 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v3, 20, 11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v3, 20, 11
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 20, 11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v1, 20, 11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v11, 0x3f1, v5
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v8, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v8, 0x3f1, v10
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_med3_i32 v11, v11, 0, 13
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v9, v2
-; GFX11-TRUE16-NEXT:    v_med3_i32 v8, v8, 0, 13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x1000, v4
+; GFX11-TRUE16-NEXT:    v_med3_i32 v10, v10, 0, 13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x1000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v8, v2
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v8, 0x3f1, v9
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, v10, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x1000, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, v11, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v13, v0
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v13, 0x3f1, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, v8, v12
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v16
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x1000, v0
-; GFX11-TRUE16-NEXT:    v_med3_i32 v13, v13, 0, 13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_med3_i32 v8, v8, 0, 13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v15, v0
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v15, 0x3f1, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, v8, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_med3_i32 v15, v15, 0, 13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v13, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v9
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v10, 12, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, v13, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v14, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0xfffffc10, v9
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x1000, v0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v9, 12, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v15, v14
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s3, 31, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v13, v17 :: v_dual_and_b32 v13, 7, v8
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0xfffffc10, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v13, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, v15, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v13
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v5, 12, v4
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v17, 0x7e00
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v12, v13
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v14, 12, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v15, v13
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_and_b32 v12, 7, v9
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v15, v14
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v12, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v16, 12, v0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 7, v10
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v12
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, 0x7e00
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 7, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v8, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v11
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v15, v13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 3, v14
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s1, 5, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0x7c00, v8, s3
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, v11, v13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v17 :: v_dual_add_nc_u32 v9, v9, v12
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0x7c00, v12, s0
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s2, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, v10, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v8, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v17, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -5140,7 +5217,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, 0x7c00, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v12, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v6
@@ -5153,123 +5230,114 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v5, 20, 11
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 20, 11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v8
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v5
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 20, 11
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v15, 0x3f1, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v9, v9, 0, 13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v5, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v3, 20, 11
+; GFX11-FAKE16-NEXT:    v_med3_i32 v15, v15, 0, 13
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v8, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x1000, v4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v5, v0
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, 0x3f1, v10
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v8, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v9, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x1000, v0
-; GFX11-FAKE16-NEXT:    v_med3_i32 v5, v5, 0, 13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, v9, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v12, v2
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v13
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, v9, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0xfffffc10, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, v5, v14
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, v9, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 1, v13
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x1000, v2
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v12, v12, 0, 13
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v9, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v12, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v15, v9
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0xfffffc10, v8
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v15, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0xfffffc10, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, v12, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v5, 12, v4
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v5
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 1, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, v15, v11
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 1, v11
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v9, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v15 :: v_dual_add_nc_u32 v10, 0xfffffc10, v10
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v8, 12, v4
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v16, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v10, 12, v0
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v10, 12, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v13, v18, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v13, 12, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v13
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 7, v9
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v14, 12, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v8
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v12 :: v_dual_and_b32 v13, 7, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v11
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 7, v5
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 7, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v15, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v5, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v12
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v13
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v13
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v11
+; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v13
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v16, v18
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, v12, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, 0x7c00, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v12 :: v_dual_add_nc_u32 v11, v11, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, 0x7c00, v12, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v5
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v7
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc <3 x double> %mag to <3 x half>
@@ -5808,28 +5876,27 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffe, v12
 ; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v13, v7, 20, 11
-; SI-NEXT:    s_movk_i32 s4, 0x3f1
+; SI-NEXT:    s_movk_i32 s6, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v6, v12, v6
-; SI-NEXT:    v_sub_i32_e32 v14, vcc, s4, v13
+; SI-NEXT:    v_sub_i32_e32 v14, vcc, s6, v13
 ; SI-NEXT:    v_or_b32_e32 v12, 0x1000, v6
 ; SI-NEXT:    v_med3_i32 v14, v14, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, v14, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, v14, v15
+; SI-NEXT:    v_or_b32_e32 v16, 1, v15
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v14, v12
-; SI-NEXT:    s_movk_i32 s5, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v13, vcc, s5, v13
+; SI-NEXT:    s_movk_i32 s7, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e32 v12, v15, v16, vcc
+; SI-NEXT:    v_add_i32_e32 v13, vcc, s7, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 12, v13
-; SI-NEXT:    v_or_b32_e32 v12, v15, v12
 ; SI-NEXT:    v_or_b32_e32 v14, v6, v14
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT:    v_and_b32_e32 v14, 7, v12
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v14
-; SI-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v14
-; SI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v14, v14, v15
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v14
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; SI-NEXT:    v_mov_b32_e32 v14, 0x7c00
@@ -5837,9 +5904,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT:    v_mov_b32_e32 v15, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_movk_i32 s6, 0x40f
+; SI-NEXT:    s_movk_i32 s8, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v13
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v12, 0x1ff, v5
@@ -5852,32 +5919,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v12, v5, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v4, v7, v4
-; SI-NEXT:    v_sub_i32_e32 v13, vcc, s4, v12
+; SI-NEXT:    v_sub_i32_e32 v13, vcc, s6, v12
 ; SI-NEXT:    v_or_b32_e32 v7, 0x1000, v4
 ; SI-NEXT:    v_med3_i32 v13, v13, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, v13, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, v13, v16
+; SI-NEXT:    v_or_b32_e32 v17, 1, v16
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v13, v7
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v12, vcc, s5, v12
+; SI-NEXT:    v_cndmask_b32_e32 v7, v16, v17, vcc
+; SI-NEXT:    v_add_i32_e32 v12, vcc, s7, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 12, v12
-; SI-NEXT:    v_or_b32_e32 v7, v16, v7
 ; SI-NEXT:    v_or_b32_e32 v13, v4, v13
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v13, v7, vcc
 ; SI-NEXT:    v_and_b32_e32 v13, 7, v7
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v13
-; SI-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v13
-; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v13, v13, v16
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v13
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 2, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v14, v7, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_and_b32_e32 v7, 0x1ff, v3
@@ -5890,32 +5956,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v7, v3, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v2, v5, v2
-; SI-NEXT:    v_sub_i32_e32 v12, vcc, s4, v7
+; SI-NEXT:    v_sub_i32_e32 v12, vcc, s6, v7
 ; SI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v12, v12, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, v12, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
+; SI-NEXT:    v_or_b32_e32 v16, 1, v13
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v5
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, s5, v7
+; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v16, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 12, v7
-; SI-NEXT:    v_or_b32_e32 v5, v13, v5
 ; SI-NEXT:    v_or_b32_e32 v12, v2, v12
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v12, 7, v5
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
-; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v12, v12, v13
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v12
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
@@ -5928,32 +5993,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v5, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v7, vcc, s4, v5
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, s6, v5
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, v7, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v12
+; SI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v3
-; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
+; SI-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v5
-; SI-NEXT:    v_or_b32_e32 v3, v12, v3
 ; SI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v3
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v7, v7, v12
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -5979,28 +6043,27 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v10, 0xffe, v10
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; VI-NEXT:    s_movk_i32 s4, 0x3f1
+; VI-NEXT:    s_movk_i32 s6, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v5
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, s6, v5
 ; VI-NEXT:    v_or_b32_e32 v10, 0x1000, v4
 ; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v10
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
+; VI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v10
-; VI-NEXT:    s_movk_i32 s5, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
+; VI-NEXT:    s_movk_i32 s7, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v13, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s7, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v5
-; VI-NEXT:    v_or_b32_e32 v10, v12, v10
 ; VI-NEXT:    v_or_b32_e32 v11, v4, v11
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT:    v_and_b32_e32 v11, 7, v10
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v11, v11, v12
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; VI-NEXT:    v_mov_b32_e32 v11, 0x7c00
@@ -6008,9 +6071,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT:    s_movk_i32 s6, 0x40f
+; VI-NEXT:    s_movk_i32 s8, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v10, 0x1ff, v7
 ; VI-NEXT:    v_or_b32_e32 v6, v10, v6
@@ -6020,32 +6083,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v7, v7, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v6
-; VI-NEXT:    v_sub_u32_e32 v10, vcc, s4, v7
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, s6, v7
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v5
 ; VI-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, v10, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
+; VI-NEXT:    v_or_b32_e32 v14, 1, v13
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
+; VI-NEXT:    v_cndmask_b32_e32 v6, v13, v14, vcc
+; VI-NEXT:    v_add_u32_e32 v7, vcc, s7, v7
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, 12, v7
-; VI-NEXT:    v_or_b32_e32 v6, v13, v6
 ; VI-NEXT:    v_or_b32_e32 v10, v5, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; VI-NEXT:    v_and_b32_e32 v10, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
-; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v10, v10, v13
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
 ; VI-NEXT:    v_and_b32_e32 v7, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v7, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
@@ -6055,32 +6117,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v6, v0
-; VI-NEXT:    v_sub_u32_e32 v7, vcc, s4, v1
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, s6, v1
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, v7, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
+; VI-NEXT:    v_or_b32_e32 v13, 1, v10
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
+; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v13, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v1
-; VI-NEXT:    v_or_b32_e32 v6, v10, v6
 ; VI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; VI-NEXT:    v_and_b32_e32 v7, 7, v6
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v7, v7, v10
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
@@ -6090,32 +6151,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v3
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, s6, v3
 ; VI-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
+; VI-NEXT:    v_or_b32_e32 v10, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v2
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v10, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v3
-; VI-NEXT:    v_or_b32_e32 v2, v7, v2
 ; VI-NEXT:    v_or_b32_e32 v6, v1, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v2
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v6, v6, v7
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -6131,32 +6191,31 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s4, v4
+; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
-; GFX9-NEXT:    s_movk_i32 s5, 0xffe
+; GFX9-NEXT:    s_movk_i32 s7, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v11, v5, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v4, v10, s5, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v10, s7, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v12, 0x3f1, v11
 ; GFX9-NEXT:    v_or_b32_e32 v10, 0x1000, v4
 ; GFX9-NEXT:    v_med3_i32 v12, v12, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, v12, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
+; GFX9-NEXT:    v_or_b32_e32 v14, 1, v13
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v11, 0xfffffc10, v11
-; GFX9-NEXT:    v_or_b32_e32 v10, v13, v10
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v13, v14, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v11, 12, v4
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v12, 7, v10
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v12, v12, v13
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v12
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
 ; GFX9-NEXT:    v_add_u32_e32 v10, v10, v12
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7c00
@@ -6164,115 +6223,112 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    s_movk_i32 s6, 0x40f
+; GFX9-NEXT:    s_movk_i32 s8, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v11
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    s_mov_b32 s7, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s7, v4
-; GFX9-NEXT:    v_and_or_b32 v5, v7, s4, v6
+; GFX9-NEXT:    s_mov_b32 s9, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s9, v4
+; GFX9-NEXT:    v_and_or_b32 v5, v7, s6, v6
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
 ; GFX9-NEXT:    v_bfe_u32 v10, v7, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v5, v6, s5, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s7, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 0x3f1, v10
 ; GFX9-NEXT:    v_or_b32_e32 v6, 0x1000, v5
 ; GFX9-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
+; GFX9-NEXT:    v_or_b32_e32 v15, 1, v14
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v10, 0xfffffc10, v10
-; GFX9-NEXT:    v_or_b32_e32 v6, v14, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v15, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v10, 12, v5
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v11, 7, v6
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v11, v11, v14
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v11
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v10
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v10
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v6, s7, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v6, s5, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v6, s7, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 0x3f1, v7
 ; GFX9-NEXT:    v_or_b32_e32 v6, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, v10, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, v10, v11
+; GFX9-NEXT:    v_or_b32_e32 v14, 1, v11
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v7, 0xfffffc10, v7
-; GFX9-NEXT:    v_or_b32_e32 v6, v11, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v11, v14, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v7, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v10, 7, v6
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v10, v10, v11
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v6, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 0x3f1, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, v7, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
+; GFX9-NEXT:    v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 0xfffffc10, v6
-; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v6, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v7, 7, v2
-; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s5, 0x7fff7fff
@@ -6288,158 +6344,149 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v7, 20, 11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 8, v7
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v5
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v5
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v5, 20, 11
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v5, 20, 11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v3, 20, 11
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v1, 20, 11
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v7.h
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0xffe, v11, v6
 ; GFX11-TRUE16-NEXT:    v_med3_i32 v11, v12, 0, 13
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x1000, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v15, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, v11, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x1000, v4
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v14, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v14, 0x3f1, v17
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v10, 12, v6
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v16
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v14, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v10, 12, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v16, v4
-; GFX11-TRUE16-NEXT:    v_med3_i32 v14, v14, 0, 13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v21, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    v_med3_i32 v12, v12, 0, 13
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v3, 20, 11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v18, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, 0x7e00
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x1000, v4
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v21, 0x3f1, v19
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 7, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, v12, v15
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT:    v_med3_i32 v21, v21, 0, 13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v18, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, v14, v22
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v12, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 1, v19
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v18
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v18, 0x3f1, v17
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v14, v2
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v16, 12, v4
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0xfffffc10, v17
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x1000, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v19, v20, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v23
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, v14, v18
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x1000, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v22
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, v21, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v7.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, 0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, 0x7c00, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v16
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v12, v14, v12 :: v_dual_add_nc_u32 v11, v11, v21
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v18, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_med3_i32 v14, v18, 0, 13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_and_b32 v18, 7, v12
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0xfffffc10, v17
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v20
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, v14, v15
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v17, 12, v4
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v17
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v21, v22
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v1, 20, 11
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, v14, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 1, v19
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, v14, v15
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v17, 12, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v10, v0
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, v19, v21, s0
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v18
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0xfffffc10, v11
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x1000, v0
 ; GFX11-TRUE16-NEXT:    v_med3_i32 v10, v10, 0, 13
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0x8000, v15, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v14
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v14, 12, v2
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, v10, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v22, v11
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, v15, v14, s1
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, v10, v22
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 7, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 2, v14
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0x8000, v20, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.h
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, v10, v7
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 7, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v16
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v18, v21
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0xfffffc10, v20
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 1, v7
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v19
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v22
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v11, 12, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v7, v18, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v16, 12, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v16
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_add_nc_u32 v10, v12, v18
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v17
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s1
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, v12, v15
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v7
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 0x40f, v16
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 2, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v12
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v18, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s1
+; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, v7, v10
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x8000, v15, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 31, v17
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x8000, v20, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v3.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v13 :: v_dual_add_nc_u32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, v14, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v5, 0x7c00, v12, s1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v15, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v11
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v20, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v4, v9
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v11
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v15, v0
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v20, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v8
@@ -6455,150 +6502,144 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v7, 20, 11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v7, 20, 11
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v14
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v3, 20, 11
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v11, v4
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v11, v12, 0, 13
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x1000, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v6, 0xffe, v13, v6
-; GFX11-FAKE16-NEXT:    v_med3_i32 v13, v17, 0, 13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, v11, v12
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x1000, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v11, v12
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s3, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 1, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v12
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v11
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v15
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v13, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v10, 12, v4
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v10, 12, v4
+; GFX11-FAKE16-NEXT:    v_med3_i32 v15, v17, 0, 13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v6
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, v13, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v18, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v18, 0x3f1, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, v15, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v1, 20, 11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, v15, v17
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 7, v11
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 1, v17
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-FAKE16-NEXT:    v_med3_i32 v18, v18, 0, 13
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v16, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v13, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v14, 12, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v19
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s1, v15, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v19
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v14, 12, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v17, v21, s1
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v20
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v3, 20, 11
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, v18, v16
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v17, v2
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v19
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0xfffffc10, v19
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_lshlrev_b32 v18, v18, v20
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 7, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x1000, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0xfffffc10, v20
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v17, v17, 0, 13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 2, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, v17, v21
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v18, v16
-; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, 0x7e00 :: v_dual_lshlrev_b32 v17, v17, v23
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v17, v21
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, v13, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0xfffffc10, v12
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v12, 12, v0
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v20, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v19, 12, v2
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, 0x7c00, v18 :: v_dual_and_b32 v15, 7, v16
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v19, 0x7e00
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v12, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v13
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v18, v2
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x1000, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_med3_i32 v12, v12, 0, 13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0xfffffc10, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, v12, v18
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v21
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 7, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 1, v21
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 2, v15
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v16
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v2
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, v17, v16
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v18
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v13, 12, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, v17, v23
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 1, v23
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v21, v24, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v20, v17, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v17, v16
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v20, 12, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v23, v25, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 7, v12
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 7, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v17
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, 0x7c00, v19, s3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v17
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 2, v16
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0x8000, v5, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v13, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v11
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 2, v16
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 2, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, v15, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, 0x7c00, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v12
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v19
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, v12, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v16, v15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v19, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v20
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x8000, v7, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 462d7748b86cd..af0c38c5624ba 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3944,9 +3944,10 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
-; SI-NEXT:    v_bfe_u32 v3, v1, 20, 11
+; SI-NEXT:    v_and_b32_e32 v3, 0x7ff, v3
 ; SI-NEXT:    s_movk_i32 s4, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_sub_i32_e32 v4, vcc, s4, v3
@@ -3954,21 +3955,20 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -3994,9 +3994,10 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
-; VI-NEXT:    v_bfe_u32 v3, v1, 20, 11
+; VI-NEXT:    v_and_b32_e32 v3, 0x7ff, v3
 ; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, s4, v3
@@ -4004,21 +4005,20 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4041,47 +4041,47 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 20, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0x7ff, v2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_med3_i32 v2, v4, 0, 13
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 0x3f1, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xfffffc10, v2
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
-; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_med3_i32 v3, v3, 0, 13
+; GFX11-NEXT:    v_lshl_or_b32 v7, v2, 12, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v4, 7, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
@@ -4106,21 +4106,20 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4154,21 +4153,20 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4202,35 +4200,32 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    v_lshl_or_b32 v7, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
@@ -4258,21 +4253,20 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; SI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, v5, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, v5, v6
+; SI-NEXT:    v_or_b32_e32 v7, 1, v6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 12, v4
-; SI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v5, 7, v2
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
-; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v5, v5, v6
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x7c00
@@ -4310,21 +4304,20 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
+; VI-NEXT:    v_or_b32_e32 v8, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v4
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v5
-; VI-NEXT:    v_or_b32_e32 v4, v7, v4
 ; VI-NEXT:    v_or_b32_e32 v6, v0, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v4
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v6, v6, v7
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7c00
@@ -4359,32 +4352,28 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX11-NEXT:    v_or_b32_e32 v7, 1, v6
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v3, v6, v3
-; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0xfffffc10, v4
-; GFX11-NEXT:    v_lshl_or_b32 v5, v4, 12, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0xfffffc10, v4
+; GFX11-NEXT:    v_lshl_or_b32 v8, v4, 12, v2
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0x7e00 :: v_dual_cndmask_b32 v3, v8, v3
 ; GFX11-NEXT:    v_and_b32_e32 v5, 7, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v5
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v6 :: v_dual_add_nc_u32 v3, v3, v5
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -4521,21 +4510,20 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
+; SI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
-; SI-NEXT:    v_or_b32_e32 v5, v8, v5
 ; SI-NEXT:    v_or_b32_e32 v7, v4, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v5
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v7, v7, v8
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; SI-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -4572,22 +4560,21 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
+; VI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
-; VI-NEXT:    v_or_b32_e32 v5, v8, v5
 ; VI-NEXT:    v_or_b32_e32 v7, v4, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v7, 7, v5
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; VI-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[2:3]
-; VI-NEXT:    v_or_b32_e32 v7, v7, v8
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -4625,28 +4612,27 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, v5, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v9, 1, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX11-NEXT:    v_or_b32_e32 v5, v8, v5
 ; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v7, v6, 12, v4
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshl_or_b32 v10, v6, 12, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 7, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 2, v5
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX11-NEXT:    v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v5
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
@@ -4681,21 +4667,20 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
-; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; SI-NEXT:    v_or_b32_e32 v4, v4, v5
+; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4731,21 +4716,20 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
-; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
-; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; VI-NEXT:    v_or_b32_e32 v4, v4, v5
+; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
+; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4780,36 +4764,34 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    v_lshl_or_b32 v7, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
+; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index e687745469014..40b33f48f4813 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s1, 1, s1
-; SI-NEXT:    s_cselect_b32 s0, 0, s0
 ; SI-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; SI-NEXT:    s_cmp_eq_u32 s1, 1
+; SI-NEXT:    s_cselect_b32 s0, 0x80008000, s0
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -4358,9 +4358,10 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v2
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
 ; SI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0x80008000
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, vcc
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 1b092b283290a..c20b99444ab35 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -349,29 +349,24 @@ define i32 @select_fneg_xor_select_i32(i1 %cond0, i1 %cond1, i32 %arg0, i32 %arg
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GCN-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, -v2, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, -v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i32 %arg0, -2147483648
   %select0 = select i1 %cond0, i32 %arg1, i32 %fneg0
@@ -550,31 +545,25 @@ define i64 @select_fneg_xor_select_i64(i1 %cond0, i1 %cond1, i64 %arg0, i64 %arg
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 1, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
-; GCN-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
+; GCN-NEXT:    v_cndmask_b32_e64 v2, -v3, v5, vcc
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v2, -v2, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: select_fneg_xor_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, -v3, v5, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80000000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, -v2, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i64 %arg0, 9223372036854775808
   %select0 = select i1 %cond0, i64 %arg1, i64 %fneg0
@@ -936,10 +925,8 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
-; GCN-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GCN-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v0
-; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v1, v1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, -v1, vcc
 ; GCN-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -947,14 +934,14 @@ define double @cospiD_pattern0(i32 %arg, double %arg1, double %arg2) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v5, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 0x80000000, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, -v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %i = and i32 %arg, 1
   %i3 = icmp eq i32 %i, 0
@@ -1015,12 +1002,11 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff8000
-; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX7-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v3
+; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 49c563eef5d82..d99cf35c482a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -111,34 +111,36 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
 ; SI-NEXT:    v_readfirstlane_b32 s1, v0
 ; SI-NEXT:    s_sub_i32 s6, 0x3f1, s0
-; SI-NEXT:    s_or_b32 s1, s8, s1
+; SI-NEXT:    s_or_b32 s10, s8, s1
 ; SI-NEXT:    v_med3_i32 v0, s6, 0, 13
-; SI-NEXT:    s_or_b32 s6, s1, 0x1000
-; SI-NEXT:    v_readfirstlane_b32 s8, v0
-; SI-NEXT:    s_lshr_b32 s9, s6, s8
-; SI-NEXT:    s_lshl_b32 s8, s9, s8
-; SI-NEXT:    s_cmp_lg_u32 s8, s6
-; SI-NEXT:    s_cselect_b32 s6, 1, 0
-; SI-NEXT:    s_addk_i32 s0, 0xfc10
-; SI-NEXT:    s_or_b32 s6, s9, s6
-; SI-NEXT:    s_lshl_b32 s8, s0, 12
-; SI-NEXT:    s_or_b32 s8, s1, s8
-; SI-NEXT:    s_cmp_lt_i32 s0, 1
-; SI-NEXT:    s_cselect_b32 s6, s6, s8
-; SI-NEXT:    s_and_b32 s8, s6, 7
-; SI-NEXT:    s_cmp_gt_i32 s8, 5
-; SI-NEXT:    s_cselect_b32 s9, 1, 0
+; SI-NEXT:    s_or_b32 s1, s10, 0x1000
+; SI-NEXT:    v_readfirstlane_b32 s6, v0
+; SI-NEXT:    s_lshr_b32 s8, s1, s6
+; SI-NEXT:    s_or_b32 s9, s8, 1
+; SI-NEXT:    s_lshl_b32 s6, s8, s6
+; SI-NEXT:    s_cmp_lg_u32 s6, s1
+; SI-NEXT:    s_cselect_b32 s1, s9, s8
+; SI-NEXT:    s_add_i32 s6, s0, 0xfffffc10
+; SI-NEXT:    s_lshl_b32 s0, s6, 12
+; SI-NEXT:    s_or_b32 s0, s10, s0
+; SI-NEXT:    s_cmp_lt_i32 s6, 1
+; SI-NEXT:    s_cselect_b32 s11, s1, s0
+; SI-NEXT:    s_and_b32 s8, s11, 7
 ; SI-NEXT:    s_cmp_eq_u32 s8, 3
-; SI-NEXT:    s_cselect_b32 s8, 1, 0
-; SI-NEXT:    s_lshr_b32 s6, s6, 2
-; SI-NEXT:    s_or_b32 s8, s8, s9
-; SI-NEXT:    s_add_i32 s6, s6, s8
-; SI-NEXT:    s_cmp_lt_i32 s0, 31
-; SI-NEXT:    s_cselect_b32 s6, s6, 0x7c00
-; SI-NEXT:    s_cmp_lg_u32 s1, 0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_cmp_gt_i32 s8, 5
+; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_cselect_b32 s0, 1, 0
+; SI-NEXT:    s_lshr_b32 s1, s11, 2
+; SI-NEXT:    s_add_i32 s1, s1, s0
+; SI-NEXT:    s_cmp_lt_i32 s6, 31
+; SI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
+; SI-NEXT:    s_cmp_lg_u32 s10, 0
 ; SI-NEXT:    s_cselect_b32 s1, s2, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s0, 0x40f
-; SI-NEXT:    s_cselect_b32 s0, s1, s6
+; SI-NEXT:    s_cmpk_eq_i32 s6, 0x40f
+; SI-NEXT:    s_cselect_b32 s0, s1, s0
 ; SI-NEXT:    s_lshr_b32 s1, s7, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
 ; SI-NEXT:    s_or_b32 s6, s1, s0
@@ -165,37 +167,39 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s6, s7, 0xb0014
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s8, s4
-; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s6
+; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s8, s4
+; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s5
 ; VI-SAFE-SDAG-NEXT:    v_med3_i32 v0, s8, 0, 13
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, 0x1000
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s9, s5, s8
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s9, s4, s8
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s10, s9, 1
 ; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s8, s9, s8
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s8, s5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
-; VI-SAFE-SDAG-NEXT:    s_addk_i32 s6, 0xfc10
-; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s8, s6, 12
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s9, s5
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s8, s4, s8
-; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s6, 1
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s8
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s5, 7
-; VI-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s8, 5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s9, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s8, s4
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s10, s9
+; VI-SAFE-SDAG-NEXT:    s_add_i32 s10, s5, 0xfffffc10
+; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s5, s10, 12
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
+; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 1
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s11, s4, s5
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s11, 7
 ; VI-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s8, 3
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s8, 1, 0
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s8, s8, s9
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
-; VI-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s8
-; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s6, 31
+; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s8, 5
+; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; VI-SAFE-SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
+; VI-SAFE-SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s11, 2
+; VI-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s4
+; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 31
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
+; VI-SAFE-SDAG-NEXT:    s_movk_i32 s5, 0x7e00
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
-; VI-SAFE-SDAG-NEXT:    s_movk_i32 s4, 0x7e00
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
-; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s6, 0x40f
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s5
+; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s10, 0x40f
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s5, s4
 ; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s7, 16
 ; VI-SAFE-SDAG-NEXT:    s_and_b32 s5, s5, 0x8000
 ; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s5, s4
@@ -296,21 +300,23 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
 ; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s8, s7, 1
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s8, s7
 ; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s7, s5
 ; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s2, 12
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s4, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
 ; GFX10-SAFE-SDAG-NEXT:    s_movk_i32 s6, 0x7e00
@@ -425,23 +431,26 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s8, s7, 1
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s8, s7
 ; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s7, s5
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s2, 12
 ; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s4, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
+; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s6, exec_lo
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
index d8f21d285ddff..27e5b521ae8c3 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
@@ -284,91 +284,85 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) {
 ; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x1ff
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s0, v0
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x1ff
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s2, v0
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX950-SDAG-NEXT:    s_movk_i32 s1, 0xffe
+; GFX950-SDAG-NEXT:    s_movk_i32 s3, 0xffe
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX950-SDAG-NEXT:    v_bfe_u32 v5, v1, 20, 11
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v4, s1, v0
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v4, s3, v0
 ; GFX950-SDAG-NEXT:    v_sub_u32_e32 v6, 0x3f1, v5
 ; GFX950-SDAG-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX950-SDAG-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v7, v6, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v8, 1, v7
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v4
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
 ; GFX950-SDAG-NEXT:    v_lshl_or_b32 v6, v5, 12, v0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v4, v7, v4
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
-; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x40f
-; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX950-SDAG-NEXT:    s_movk_i32 s4, 0x40f
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v6, 7, v4
-; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
-; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
-; GFX950-SDAG-NEXT:    s_mov_b32 s3, 0x8000
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v6, v6, v7
+; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], 5, v6
+; GFX950-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v4, v4, v6
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7c00
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7e00
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX950-SDAG-NEXT:    s_mov_b32 s5, 0x8000
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v5
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s3, v0
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v3, s0, v2
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s5, v0
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v3, s2, v2
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX950-SDAG-NEXT:    v_bfe_u32 v4, v3, 20, 11
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s1, v1
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s3, v1
 ; GFX950-SDAG-NEXT:    v_sub_u32_e32 v5, 0x3f1, v4
 ; GFX950-SDAG-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX950-SDAG-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v8, v5, v2
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v9, 1, v8
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v4, 0xfffffc10, v4
 ; GFX950-SDAG-NEXT:    v_lshl_or_b32 v5, v4, 12, v1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v2, v8, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 7, v2
-; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
-; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX950-SDAG-NEXT:    s_nop 0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v5, v5, v8
+; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], 5, v5
+; GFX950-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v4
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v4
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s3, v1
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s5, v1
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index 9389f1614721f..a841f7ffa02b9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -51,160 +51,314 @@ bb:
 
 ; FIXME: This generates "instid1(/* invalid instid value */)".
 define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
-; GFX11-LABEL: f2:
-; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
-; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_load_b32 s19, s[16:17], 0x24
-; GFX11-NEXT:    s_mov_b32 s12, s13
-; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s20, 0
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v0, s19, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_13
-; GFX11-NEXT:  ; %bb.1: ; %bb14
-; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
-; GFX11-NEXT:    s_mov_b32 s18, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
-; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
-; GFX11-NEXT:  ; %bb.2: ; %bb15
-; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s21, s14
-; GFX11-NEXT:    s_mov_b32 s14, s15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s14, s21
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-NEXT:    s_branch .LBB2_12
-; GFX11-NEXT:  .LBB2_3:
-; GFX11-NEXT:    s_mov_b32 s2, 0
-; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
-; GFX11-NEXT:  .LBB2_4: ; %bb16
-; GFX11-NEXT:    s_load_b32 s0, s[16:17], 0x54
-; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
-; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX11-NEXT:    s_and_b32 s1, s23, 1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
-; GFX11-NEXT:    s_mov_b32 s0, -1
-; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
-; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
-; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mul_hi_u32 s0, s29, s28
-; GFX11-NEXT:    s_mul_i32 s1, s29, s28
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b32 s0, s0, s30
-; GFX11-NEXT:    s_mul_i32 s0, s0, s22
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_mul_i32 s0, s0, s20
-; GFX11-NEXT:    s_or_b32 s0, s19, s0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
-; GFX11-NEXT:    s_mov_b32 s0, s1
-; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB2_6: ; %bb18
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-NEXT:    s_and_b32 s1, s8, s1
-; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
-; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
-; GFX11-NEXT:    s_and_b32 s1, s1, 1
-; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX11-NEXT:    s_and_b32 s20, s9, exec_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-NEXT:    v_readfirstlane_b32 s13, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s13, s19, s13
-; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
-; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s13, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
-; GFX11-NEXT:  ; %bb.7: ; %Flow
-; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:  .LBB2_8: ; %Flow12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
-; GFX11-NEXT:  ; %bb.9:
-; GFX11-NEXT:    s_xor_b32 s0, s8, -1
-; GFX11-NEXT:  .LBB2_10: ; %bb17
-; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
-; GFX11-NEXT:  ; %bb.11: ; %Flow6
-; GFX11-NEXT:    s_mov_b32 s18, -1
-; GFX11-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-NEXT:    s_and_b32 s20, s2, exec_lo
-; GFX11-NEXT:    s_or_not1_b32 s0, s18, exec_lo
-; GFX11-NEXT:  .LBB2_13: ; %Flow9
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-NEXT:    s_and_saveexec_b32 s3, s0
-; GFX11-NEXT:    s_cbranch_execz .LBB2_15
-; GFX11-NEXT:  ; %bb.14: ; %bb43
-; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
-; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s14, s15
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_or_b32 s20, s20, exec_lo
-; GFX11-NEXT:  .LBB2_15: ; %Flow14
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-NEXT:    s_and_saveexec_b32 s0, s20
-; GFX11-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
-; GFX11-NEXT:    ; divergent unreachable
-; GFX11-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
-; GFX11-NEXT:    s_endpgm
+
+; GFX11-TRUE16-LABEL: f2:
+; GFX11-TRUE16:       ; %bb.0: ; %bb
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-TRUE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s13
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s20, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_13
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb14
+; GFX11-TRUE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s21, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-TRUE16-NEXT:    s_bitcmp0_b32 s21, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_3
+; GFX11-TRUE16-NEXT:  ; %bb.2: ; %bb15
+; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s14
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-TRUE16-NEXT:    s_branch .LBB2_12
+; GFX11-TRUE16-NEXT:  .LBB2_3:
+; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT:  .LBB2_4: ; %bb16
+; GFX11-TRUE16-NEXT:    s_load_b32 s1, s[16:17], 0x54
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s23, 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, -1
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT:    s_and_b32 s9, s23, 1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s9, 0
+; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_8
+; GFX11-TRUE16-NEXT:  ; %bb.5: ; %bb18.preheader
+; GFX11-TRUE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mul_hi_u32 s8, s29, s28
+; GFX11-TRUE16-NEXT:    s_mul_i32 s9, s29, s28
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s8, s9, 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, 0
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s8, s30
+; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s22
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s20
+; GFX11-TRUE16-NEXT:    s_or_b32 s8, s19, s8
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshl_b64 s[20:21], s[8:9], 1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s9
+; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[20:21]
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-TRUE16-NEXT:    .p2align 6
+; GFX11-TRUE16-NEXT:  .LBB2_6: ; %bb18
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s8, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s1, s8
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, exec_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, s19, s13
+; GFX11-TRUE16-NEXT:    s_and_b32 s13, 0xffff, s9
+; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, 1
+; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-TRUE16-NEXT:    s_and_b32 s20, s2, exec_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-TRUE16-NEXT:    s_or_b32 s19, s9, 0x100
+; GFX11-TRUE16-NEXT:    s_and_b32 s13, 1, s13
+; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s13, 1
+; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s19, s9
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_6
+; GFX11-TRUE16-NEXT:  ; %bb.7: ; %Flow
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, 0
+; GFX11-TRUE16-NEXT:  .LBB2_8: ; %Flow12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s8
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_12
+; GFX11-TRUE16-NEXT:  ; %bb.9:
+; GFX11-TRUE16-NEXT:    s_xor_b32 s1, s1, -1
+; GFX11-TRUE16-NEXT:  .LBB2_10: ; %bb17
+; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
+; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_10
+; GFX11-TRUE16-NEXT:  ; %bb.11: ; %Flow6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s18, -1
+; GFX11-TRUE16-NEXT:  .LBB2_12: ; %Flow11
+; GFX11-TRUE16-NEXT:    s_and_b32 s20, s0, exec_lo
+; GFX11-TRUE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB2_13: ; %Flow9
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s3, s0
+; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_15
+; GFX11-TRUE16-NEXT:  ; %bb.14: ; %bb43
+; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-TRUE16-NEXT:    s_or_b32 s20, s20, exec_lo
+; GFX11-TRUE16-NEXT:  .LBB2_15: ; %Flow14
+; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s20
+; GFX11-TRUE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GFX11-TRUE16-NEXT:    ; divergent unreachable
+; GFX11-TRUE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
+; GFX11-TRUE16-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: f2:
+; GFX11-FAKE16:       ; %bb.0: ; %bb
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-FAKE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s13
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s20, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_13
+; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb14
+; GFX11-FAKE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s21, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-FAKE16-NEXT:    s_bitcmp0_b32 s21, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_3
+; GFX11-FAKE16-NEXT:  ; %bb.2: ; %bb15
+; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s14
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s21
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-FAKE16-NEXT:    s_branch .LBB2_12
+; GFX11-FAKE16-NEXT:  .LBB2_3:
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, 0
+; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB2_12
+; GFX11-FAKE16-NEXT:  .LBB2_4: ; %bb16
+; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[16:17], 0x54
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s23, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s23, 1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_8
+; GFX11-FAKE16-NEXT:  ; %bb.5: ; %bb18.preheader
+; GFX11-FAKE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mul_hi_u32 s0, s29, s28
+; GFX11-FAKE16-NEXT:    s_mul_i32 s1, s29, s28
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b32 s0, s0, s30
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s22
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s20
+; GFX11-FAKE16-NEXT:    s_or_b32 s0, s19, s0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s1
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[20:21]
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-FAKE16-NEXT:    .p2align 6
+; GFX11-FAKE16-NEXT:  .LBB2_6: ; %bb18
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s8, s1
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s19, s13
+; GFX11-FAKE16-NEXT:    s_and_b32 s13, 0xffff, s0
+; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-FAKE16-NEXT:    s_and_b32 s20, s9, exec_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-FAKE16-NEXT:    s_or_b32 s19, s0, 0x100
+; GFX11-FAKE16-NEXT:    s_and_b32 s13, 1, s13
+; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s13, 1
+; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s19, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_6
+; GFX11-FAKE16-NEXT:  ; %bb.7: ; %Flow
+; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX11-FAKE16-NEXT:  .LBB2_8: ; %Flow12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_12
+; GFX11-FAKE16-NEXT:  ; %bb.9:
+; GFX11-FAKE16-NEXT:    s_xor_b32 s0, s8, -1
+; GFX11-FAKE16-NEXT:  .LBB2_10: ; %bb17
+; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_10
+; GFX11-FAKE16-NEXT:  ; %bb.11: ; %Flow6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s18, -1
+; GFX11-FAKE16-NEXT:  .LBB2_12: ; %Flow11
+; GFX11-FAKE16-NEXT:    s_and_b32 s20, s2, exec_lo
+; GFX11-FAKE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB2_13: ; %Flow9
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s3, s0
+; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_15
+; GFX11-FAKE16-NEXT:  ; %bb.14: ; %bb43
+; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-FAKE16-NEXT:    s_or_b32 s20, s20, exec_lo
+; GFX11-FAKE16-NEXT:  .LBB2_15: ; %Flow14
+; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s20
+; GFX11-FAKE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GFX11-FAKE16-NEXT:    ; divergent unreachable
+; GFX11-FAKE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
+; GFX11-FAKE16-NEXT:    s_endpgm
+
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i12 = mul i32 %arg, %i
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index c3ce0d1aa739e..c13f1cdd23d36 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -340,15 +340,16 @@ define i64 @fneg_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v5, v3 :: v_dual_cndmask_b32 v1, v4, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -v3, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -361,15 +362,16 @@ define i64 @fneg_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v5 :: v_dual_cndmask_b32 v1, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -v3, v5, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -382,16 +384,16 @@ define i64 @fneg_1_fabs_2_select_i64(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, |v5|, v3, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, |v5|, -v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_1_fabs_2_select_i64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v5|, v3, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, |v5|, -v3, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = xor i64 %a, u0x8000000000000000
   %abs.b = and i64 %b, u0x7fffffffffffffff
@@ -405,16 +407,16 @@ define i64 @fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, |v3|, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, |v3|, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, |v3|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i64 %a, u0x7fffffffffffffff
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -427,16 +429,16 @@ define i64 @fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, |v3|, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, |v3|, v5, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, |v3|, v5, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = and i64 %a, u0x7fffffffffffffff
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -449,16 +451,16 @@ define i64 @fneg_fabs_select_i64_1(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, -|v3|, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_1:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v5, -|v3|, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -|v3|, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
@@ -471,16 +473,16 @@ define i64 @fneg_fabs_select_i64_2(i64 %cond, i64 %a, i64 %b) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
-; GCN-NEXT:    v_cndmask_b32_e64 v0, -|v3|, v5, vcc
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
+; GCN-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_fabs_select_i64_2:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, -|v3|, v5, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, -|v3|, v5, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = or i64 %a, u0x8000000000000000
   %cmp = icmp eq i64 %cond, zeroinitializer
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 4e27cf20d3c98..c52f7a4ac720a 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -124,9 +124,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_add_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i32:
@@ -136,9 +135,8 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i32:
@@ -383,16 +381,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
 ; GFX6-NEXT:    v_add_i32_e64 v2, s[4:5], v1, v3
 ; GFX6-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v2i32:
@@ -402,16 +398,14 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e64 v2, s[4:5], v1, v3
 ; GFX8-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_v2i32:
@@ -442,8 +436,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_i64:
@@ -456,8 +449,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_saddsat_i64:
@@ -470,8 +462,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_saddsat_i64:
@@ -480,12 +471,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_gt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_saddsat_i64:
@@ -494,11 +484,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, 0, v[2:3]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
index 2c7819a395c86..2549e76821e1c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -8,11 +8,10 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v2, 1, v2
+; CHECK-NEXT:    v_or_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 8
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sext = sext i8 %x to i32
   %select = select i1 %cond, i32 %sext, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index 40d80f5e83e36..09c0e775f783d 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -124,9 +124,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v1, s[4:5], v0, v1
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i32:
@@ -136,9 +135,8 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v1, s[4:5], v0, v1
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v1, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v1
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v1, -v0, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i32:
@@ -383,16 +381,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v0, v2
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v2, s[4:5], v1, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v2i32:
@@ -402,16 +398,14 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v0, v2
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, -v0, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v2, s[4:5], v1, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v2, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v2
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v2, -v1, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v2i32:
@@ -439,23 +433,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v0, v3
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v1, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v3, -v1, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v3, s[4:5], v2, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v3, -v2, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v3i32:
@@ -465,23 +456,20 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v0, v3
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, -v0, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v1, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, -v1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v3, s[4:5], v2, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v3, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, -v2, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v3i32:
@@ -511,30 +499,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v0, v4
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v1, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, -v1, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v2, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, -v2, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v4, s[4:5], v3, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v4, -v3, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v4i32:
@@ -544,30 +528,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v0, v4
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, -v0, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v1, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v5
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, -v1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v2, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v6
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, -v2, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v4, s[4:5], v3, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v7
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v4, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v4
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, -v3, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v4i32:
@@ -599,58 +579,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v0, v8
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v8, -v0, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v1, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v8, -v1, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v2, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v8, -v2, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v3, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v8, -v3, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v4, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, -v4, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v5, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v8, -v5, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v6, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, -v6, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v8, s[4:5], v7, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, v8, -v7, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v8i32:
@@ -660,58 +632,50 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v0, v8
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v8, -v0, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v1, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v9
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, -v1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v2, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v10
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, -v2, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v3, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, -v3, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v4, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v12
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v4
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, -v4, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v5, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v13
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, -v5, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v6, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v14
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, -v6, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v8, s[4:5], v7, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v15
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v8, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v8
-; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v8, -v7, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v8i32:
@@ -751,116 +715,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v0, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v0, v16, -v0, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v1, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v16, -v1, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v2, v18
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, v16, -v2, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v3, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v4, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v3, v16, -v3, s[4:5]
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v4, v20
+; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 31, v17
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, v17, -v4, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v5, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v5, v17, -v5, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v6, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v6, v17, -v6, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v7, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v7, v17, -v7, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v8, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v8, v17, -v8, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v9, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v9, v17, -v9, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v10, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v10, v17, -v10, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v11, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v11, v17, -v11, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v12, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v12, v17, -v12, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v13, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v13, v17, -v13, s[4:5]
 ; GFX6-NEXT:    v_sub_i32_e64 v17, s[4:5], v14, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX6-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v14, v17, -v14, s[4:5]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX6-NEXT:    v_sub_i32_e64 v16, s[4:5], v15, v16
 ; GFX6-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX6-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
-; GFX6-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX6-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX6-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e64 v15, v16, -v15, s[4:5]
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v16i32:
@@ -870,116 +818,100 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v0, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v0
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v16, -v0, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v1, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v17
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v1
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v16, v1, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v16, -v1, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v2, v18
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v18
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v2
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v2, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v16, v2, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v16, -v2, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v3, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v19
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v3
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v16, v3, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v4, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
-; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v4
-; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v16, -v3, s[4:5]
 ; GFX8-NEXT:    buffer_load_dword v16, off, s[0:3], s32
+; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v4, v20
+; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v20
+; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v4
+; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v17
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v17, -v4, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v5, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v21
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v5
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v5, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v17, v5, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v17, -v5, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v6, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v22
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v6
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v6, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v17, v6, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v17, -v6, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v7, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v23
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v7
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v7, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v17, v7, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v17, -v7, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v8, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v24
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v8
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v8, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, v17, -v8, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v9, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v25
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v9
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v9, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, v17, -v9, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v10, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v26
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v10
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v10, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v10, 0x80000000, v10
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v17, v10, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v17, -v10, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v11, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v27
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v11
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v11, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v11, 0x80000000, v11
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v11, v17, -v11, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v12, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v28
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v12
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v12, 0x80000000, v12
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v12, v17, v12, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v12, v17, -v12, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v13, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v29
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v13
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v13, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v13, 0x80000000, v13
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v13, v17, v13, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v13, v17, -v13, s[4:5]
 ; GFX8-NEXT:    v_sub_u32_e64 v17, s[4:5], v14, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v30
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v17, v14
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
-; GFX8-NEXT:    v_xor_b32_e32 v14, 0x80000000, v14
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v14, v17, v14, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v14, v17, -v14, s[4:5]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cmp_lt_i32_e32 vcc, 0, v16
 ; GFX8-NEXT:    v_sub_u32_e64 v16, s[4:5], v15, v16
 ; GFX8-NEXT:    v_cmp_lt_i32_e64 s[4:5], v16, v15
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v15, 31, v16
-; GFX8-NEXT:    v_xor_b32_e32 v15, 0x80000000, v15
-; GFX8-NEXT:    s_xor_b64 vcc, vcc, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc
+; GFX8-NEXT:    s_xor_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v15, v16, -v15, s[4:5]
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_v16i32:
@@ -1066,8 +998,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX6-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX6-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_i64:
@@ -1080,8 +1011,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX8-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX8-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX8-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ssubsat_i64:
@@ -1094,8 +1024,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX9-NEXT:    s_xor_b64 vcc, s[4:5], vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_ssubsat_i64:
@@ -1104,12 +1033,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX10-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_lt_i64_e64 s4, 0, v[2:3]
-; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX10-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
+; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX10-NEXT:    s_xor_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_ssubsat_i64:
@@ -1118,11 +1046,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
 ; GFX11-NEXT:    v_sub_co_u32 v4, vcc_lo, v0, v2
 ; GFX11-NEXT:    v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, 0, v[2:3]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
 ; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v6
+; GFX11-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
 ; GFX11-NEXT:    s_xor_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, -v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs)
   ret i64 %result

>From 4c79cafe4a4831dbe501f3757bf05c687459ea66 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 19:08:18 -0500
Subject: [PATCH 19/28] Remove dead code that was moved to the
 target-independent DAGCombiner

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 66 -------------------
 1 file changed, 66 deletions(-)
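
A note on the removal: getBitwiseToSrcModifierOp classified a bitwise
operation on an integer value against a sign-mask style constant as the
equivalent floating-point source modifier; the same fold now lives in
the target-independent DAGCombiner (see PATCH 20 below). For reference
only, a minimal self-contained sketch of that classification (not the
in-tree code; the enum and function names here are illustrative):

  #include "llvm/ADT/APInt.h"
  #include "llvm/CodeGen/ISDOpcodes.h"

  // Illustrative only: names are not from the tree.
  enum class SrcMod { None, Neg, Abs, NegAbs };

  // xor with the sign mask flips the sign bit (fneg); and with the
  // maximum signed value clears it (fabs); or with the sign mask sets
  // it (fneg of fabs).
  static SrcMod classifySrcModConstant(unsigned Opc, const llvm::APInt &C) {
    switch (Opc) {
    case llvm::ISD::XOR:
      return C.isSignMask() ? SrcMod::Neg : SrcMod::None;
    case llvm::ISD::AND:
      return C.isMaxSignedValue() ? SrcMod::Abs : SrcMod::None;
    case llvm::ISD::OR:
      return C.isSignMask() ? SrcMod::NegAbs : SrcMod::None;
    default:
      return SrcMod::None;
    }
  }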

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 7436de2d6a6a8..b635f27c56979 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4842,54 +4842,6 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
   return SDValue();
 }
 
-static EVT getFloatVT(EVT VT) {
-  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
-  return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
-}
-
-static SDValue getBitwiseToSrcModifierOp(SDValue N,
-                                         TargetLowering::DAGCombinerInfo &DCI) {
-
-  unsigned Opc = N.getNode()->getOpcode();
-  if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
-    return SDValue();
-
-  SelectionDAG &DAG = DCI.DAG;
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
-
-  if (!CRHS)
-    return SDValue();
-
-  EVT VT = RHS.getValueType();
-  EVT FVT = getFloatVT(VT);
-  SDLoc SL = SDLoc(N);
-
-  switch (Opc) {
-  case ISD::XOR:
-    if (CRHS->getAPIntValue().isSignMask())
-      return DAG.getNode(ISD::FNEG, SL, FVT,
-                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
-    break;
-  case ISD::OR:
-    if (CRHS->getAPIntValue().isSignMask()) {
-      SDValue Abs = DAG.getNode(ISD::FABS, SL, FVT,
-                                DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
-      return DAG.getNode(ISD::FNEG, SL, FVT, Abs);
-    }
-    break;
-  case ISD::AND:
-    if (CRHS->getAPIntValue().isMaxSignedValue())
-      return DAG.getNode(ISD::FABS, SL, FVT,
-                         DAG.getNode(ISD::BITCAST, SL, FVT, LHS));
-    break;
-  default:
-    return SDValue();
-  }
-  return SDValue();
-}
-
 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
@@ -4930,24 +4882,6 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       // DCI.AddToWorklist(MinMax.getNode());
       return MinMax;
     }
-
-    auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
-      SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS, DCI);
-      SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS, DCI);
-      if (SrcModTrue || SrcModFalse) {
-        SDLoc SL(N);
-        EVT FVT =
-            SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
-        SDValue FLHS =
-            SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
-        SDValue FRHS =
-            SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
-        SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, Cond, FLHS, FRHS);
-        return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
-      }
-      return SDValue();
-    };
-
   }
 
   // There's no reason to not do this if the condition has other uses.

>From c265ed4a57c2cf8b27cb11c651c56dde1ea6475a Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 19:22:59 -0500
Subject: [PATCH 20/28] Canonicalise TI select operand variable names and
 update tests

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  21 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 225 ++++++------------
 2 files changed, 83 insertions(+), 163 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f58ffa47fd20..5243cebbdd05a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12440,18 +12440,17 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
-  auto FoldSrcMods = [&](SDValue LHS, SDValue RHS, EVT VT) -> SDValue {
-    SDValue SrcModTrue = getBitwiseToSrcModifierOp(LHS);
-    SDValue SrcModFalse = getBitwiseToSrcModifierOp(RHS);
-    if (SrcModTrue || SrcModFalse) {
+  auto FoldSrcMods = [&](SDValue N1, SDValue N2, EVT VT) -> SDValue {
+    SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1);
+    SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2);
+    if (SrcModN1 || SrcModN2) {
       SDLoc SL(N);
-      EVT FVT =
-          SrcModTrue ? SrcModTrue.getValueType() : SrcModFalse.getValueType();
-      SDValue FLHS =
-          SrcModTrue ? SrcModTrue : DAG.getNode(ISD::BITCAST, SL, FVT, LHS);
-      SDValue FRHS =
-          SrcModFalse ? SrcModFalse : DAG.getNode(ISD::BITCAST, SL, FVT, RHS);
-      SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FLHS, FRHS);
+      EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType();
+      SDValue FN1 =
+          SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1);
+      SDValue FN2 =
+          SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2);
+      SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2);
       return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
     }
     return SDValue();
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index d52fe845d62ec..606f6d1e3939b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -275,23 +275,14 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-TRUE16-LABEL: s_test_copysign_f16_10_mag:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_and_b32 s0, s0, 0x8000
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, 0x4900
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
-; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-FAKE16-LABEL: s_test_copysign_f16_10_mag:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_and_b32 s0, s0, 0x8000
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, 0x4900
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: s_test_copysign_f16_10_mag:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s0, 0x4900
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.copysign.f16(half 10.0, half %sign)
   %cast = bitcast half %result to i16
   ret i16 %cast
@@ -1199,120 +1190,62 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_and_b32 s3, s1, 0x1ff
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s3, s0
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-TRUE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s1, s1, 8
-; GFX11-TRUE16-NEXT:    s_sub_i32 s3, 0x3f1, s0
-; GFX11-TRUE16-NEXT:    s_and_b32 s1, s1, 0xffe
-; GFX11-TRUE16-NEXT:    v_med3_i32 v1, s3, 0, 13
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-TRUE16-NEXT:    s_or_b32 s1, s1, s3
-; GFX11-TRUE16-NEXT:    s_or_b32 s3, s1, 0x1000
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s5, s3, s4
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-TRUE16-NEXT:    s_or_b32 s6, s5, 1
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s4, s3
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s6, s5
-; GFX11-TRUE16-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s0, 12
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s1, s4
-; GFX11-TRUE16-NEXT:    s_cmp_lt_i32 s0, 1
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s3, 7
-; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s5, -1, 0
-; GFX11-TRUE16-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-TRUE16-NEXT:    s_and_b32 s4, s4, exec_lo
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s3, s3, 2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_add_i32 s3, s3, s4
-; GFX11-TRUE16-NEXT:    s_cmp_lt_i32 s0, 31
-; GFX11-TRUE16-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, s4, 0x7c00
-; GFX11-TRUE16-NEXT:    s_cmpk_eq_i32 s0, 0x40f
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s0, s1, s3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_and_b32 s3, s1, 0x1ff
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s3, s0
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, -1, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-FAKE16-NEXT:    s_bfe_u32 s0, s1, 0xb0014
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s1, s1, 8
-; GFX11-FAKE16-NEXT:    s_sub_i32 s3, 0x3f1, s0
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 0xffe
-; GFX11-FAKE16-NEXT:    v_med3_i32 v1, s3, 0, 13
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s1, s3
-; GFX11-FAKE16-NEXT:    s_or_b32 s3, s1, 0x1000
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s5, s3, s4
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-FAKE16-NEXT:    s_or_b32 s6, s5, 1
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s4, s3
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s6, s5
-; GFX11-FAKE16-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s0, 12
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s1, s4
-; GFX11-FAKE16-NEXT:    s_cmp_lt_i32 s0, 1
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s3, 7
-; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s5, -1, 0
-; GFX11-FAKE16-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-FAKE16-NEXT:    s_and_b32 s4, s4, exec_lo
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s3, s3, 2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_add_i32 s3, s3, s4
-; GFX11-FAKE16-NEXT:    s_cmp_lt_i32 s0, 31
-; GFX11-FAKE16-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s3, s3, 0x7c00
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s4, 0x7c00
-; GFX11-FAKE16-NEXT:    s_cmpk_eq_i32 s0, 0x40f
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s1, s3
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s3, s1, 0x1ff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s3, s0
+; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    s_bfe_u32 s0, s1, 0xb0014
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 8
+; GFX11-NEXT:    s_sub_i32 s3, 0x3f1, s0
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xffe
+; GFX11-NEXT:    v_med3_i32 v1, s3, 0, 13
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX11-NEXT:    s_or_b32 s1, s1, s3
+; GFX11-NEXT:    s_or_b32 s3, s1, 0x1000
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s5, s3, s4
+; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-NEXT:    s_or_b32 s6, s5, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s4, s3
+; GFX11-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX11-NEXT:    s_addk_i32 s0, 0xfc10
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b32 s4, s0, 12
+; GFX11-NEXT:    s_or_b32 s4, s1, s4
+; GFX11-NEXT:    s_cmp_lt_i32 s0, 1
+; GFX11-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s4, s3, 7
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
+; GFX11-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX11-NEXT:    s_cmp_gt_i32 s4, 5
+; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s3, s3, s4
+; GFX11-NEXT:    s_cmp_lt_i32 s0, 31
+; GFX11-NEXT:    s_movk_i32 s4, 0x7e00
+; GFX11-NEXT:    s_cselect_b32 s3, s3, 0x7c00
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, s4, 0x7c00
+; GFX11-NEXT:    s_cmpk_eq_i32 s0, 0x40f
+; GFX11-NEXT:    s_cselect_b32 s0, s1, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %mag.trunc = fptrunc double %mag to half
   %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
   %cast = bitcast half %result to i16
@@ -4418,27 +4351,15 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
-; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
-; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s2
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-TRUE16-NEXT:    ; return to shader part epilog
-;
-; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
-; GFX11-FAKE16:       ; %bb.0:
-; GFX11-FAKE16-NEXT:    s_pack_ll_b32_b16 s1, s2, s4
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s1
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_pack_ll_b32_b16 s1, s2, s4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s1
+; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
   %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
   %cast = bitcast <2 x half> %out to i32

>From d658adbc1349f5c7df030ae205dbbed4b8c9593b Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 19:28:04 -0500
Subject: [PATCH 21/28] Fix missed clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5243cebbdd05a..3e201deaaf3ff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12189,7 +12189,8 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
-  if(!TLI.shouldFoldSelectWithIdentityConstant(N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
+  if (!TLI.shouldFoldSelectWithIdentityConstant(
+          N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
     return SDValue();
 
   ConstantSDNode *CRHS = isConstOrConstSplat(RHS);

>From 9fb7344c197ef7bfbc81282de9d18ad87fe482cf Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 19:32:45 -0500
Subject: [PATCH 22/28] Suppress overzealous clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b635f27c56979..0ede2a9783461 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4876,10 +4876,10 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
     }
 
     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
-      SDValue MinMax =
-          combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+      SDValue MinMax
+      = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
       // Revisit this node so we can catch min3/max3/med3 patterns.
-      // DCI.AddToWorklist(MinMax.getNode());
+      //DCI.AddToWorklist(MinMax.getNode());
       return MinMax;
     }
   }

>From 29d9b3d2272c1ec37bb2cad00b62b1e21aae445f Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sat, 12 Jul 2025 19:33:36 -0500
Subject: [PATCH 23/28] Suppress overzealous clang-format

---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0ede2a9783461..e64d2162441ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4877,7 +4877,7 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
 
     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
       SDValue MinMax
-      = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
       // Revisit this node so we can catch min3/max3/med3 patterns.
       //DCI.AddToWorklist(MinMax.getNode());
       return MinMax;

>From cd5c7329bc074e71ff0fdad8f1ae25a99158620a Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sun, 13 Jul 2025 04:03:42 -0500
Subject: [PATCH 24/28] Remove unnecessary lambda and refactor
 foldSelectOfSourceMods() to fit the TI DAGCombiner style.

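The fold rewrites whichever select operand matches a source-modifier idiom and
bitcasts the other operand, so both sides of the new floating-point select
agree in type. A standalone C++20 check (illustration only, not LLVM code)
that the float-domain rewrite preserves the integer semantics for the
xor/fneg case:

  #include <bit>
  #include <cassert>
  #include <cstdint>

  // The integer form: select cc, (a ^ 0x80000000), b.
  static uint32_t beforeFold(bool cc, uint32_t a, uint32_t b) {
    return cc ? (a ^ 0x80000000u) : b;
  }

  // The rewritten form: bitcast (select cc, (fneg (bitcast a)), (bitcast b)).
  static uint32_t afterFold(bool cc, uint32_t a, uint32_t b) {
    float fa = std::bit_cast<float>(a), fb = std::bit_cast<float>(b);
    return std::bit_cast<uint32_t>(cc ? -fa : fb);
  }

  int main() {
    for (uint32_t a : {0x3fc00000u, 0x80000000u, 0x00000001u}) // 1.5, -0.0, denormal
      for (uint32_t b : {0x40490fdbu, 0xbf800000u})            // ~pi, -1.0
        for (bool cc : {false, true})
          assert(beforeFold(cc, a, b) == afterFold(cc, a, b));
  }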
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 53 +++++++++----------
 1 file changed, 25 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 3e201deaaf3ff..ba4767bdec1b4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -602,6 +602,7 @@ namespace {
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldSelectOfBinops(SDNode *N);
+    SDValue foldSelectOfSourceMods(SDNode *N);
     SDValue foldSextSetcc(SDNode *N);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
@@ -684,7 +685,6 @@ namespace {
                                   SDValue VecIn2, unsigned LeftIdx,
                                   bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
-    SDValue getBitwiseToSrcModifierOp(SDValue N);
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
@@ -12175,12 +12175,7 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True,
   return SDValue();
 }
 
-static EVT getFloatVT(EVT VT) {
-  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
-  return VT.isVector() ? VT.changeVectorElementType(FT) : FT;
-}
-
-SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
+static SDValue getBitwiseToSrcModifierOp(SDValue N, SelectionDAG &DAG) {
 
   unsigned Opc = N.getNode()->getOpcode();
   if (Opc != ISD::AND && Opc != ISD::XOR && Opc != ISD::OR)
@@ -12189,17 +12184,18 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.shouldFoldSelectWithIdentityConstant(
           N.getOpcode(), N->getValueType(0), ISD::SELECT, LHS, RHS))
     return SDValue();
 
   ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
-
   if (!CRHS)
     return SDValue();
 
   EVT VT = RHS.getValueType();
-  EVT FVT = getFloatVT(VT);
+  EVT FT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
+  EVT FVT = VT.isVector() ? VT.changeVectorElementType(FT) : FT;
   SDLoc SL = SDLoc(N);
 
   switch (Opc) {
@@ -12226,6 +12222,24 @@ SDValue DAGCombiner::getBitwiseToSrcModifierOp(SDValue N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::foldSelectOfSourceMods(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue N2 = N->getOperand(2);
+  EVT VT = N->getValueType(0);
+  SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1, DAG);
+  SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2, DAG);
+  if (SrcModN1 || SrcModN2) {
+    SDLoc SL(N);
+    EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType();
+    SDValue FN1 = SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1);
+    SDValue FN2 = SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2);
+    SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2);
+    return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -12441,27 +12455,10 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
   if (SDValue R = combineSelectAsExtAnd(N0, N1, N2, DL, DAG))
     return R;
 
-  auto FoldSrcMods = [&](SDValue N1, SDValue N2, EVT VT) -> SDValue {
-    SDValue SrcModN1 = getBitwiseToSrcModifierOp(N1);
-    SDValue SrcModN2 = getBitwiseToSrcModifierOp(N2);
-    if (SrcModN1 || SrcModN2) {
-      SDLoc SL(N);
-      EVT FVT = SrcModN1 ? SrcModN1.getValueType() : SrcModN2.getValueType();
-      SDValue FN1 =
-          SrcModN1 ? SrcModN1 : DAG.getNode(ISD::BITCAST, SL, FVT, N1);
-      SDValue FN2 =
-          SrcModN2 ? SrcModN2 : DAG.getNode(ISD::BITCAST, SL, FVT, N2);
-      SDValue FSelect = DAG.getNode(ISD::SELECT, SL, FVT, N0, FN1, FN2);
-      return DAG.getNode(ISD::BITCAST, SL, VT, FSelect);
-    }
-    return SDValue();
-  };
-
   // Identify bitmask operations that are source mods and create
   // the relevant fneg, fabs or fneg+fabs.
-  if (VT == MVT::i32 || VT == MVT::v2i32)
-    if (SDValue F = FoldSrcMods(N1, N2, VT))
-      return F;
+  if (SDValue F = foldSelectOfSourceMods(N))
+    return F;
 
   return SDValue();
 }

>From ec42e07d01cc84f4d4a435c2e190a25695eacdd4 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Sun, 13 Jul 2025 06:35:38 -0500
Subject: [PATCH 25/28] [NFC] Minor corrections to whitespace and test name

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp               | 1 +
 llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ba4767bdec1b4..85585472881a6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -685,6 +685,7 @@ namespace {
                                   SDValue VecIn2, unsigned LeftIdx,
                                   bool DidSplitVec);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
+
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
index c13f1cdd23d36..beab27ca97126 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-select-source-modifiers.ll
@@ -222,8 +222,8 @@ define <2 x i32> @fabs_select_v2i32_2(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %
   ret <2 x i32> %select
 }
 
-define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
-; GCN-LABEL: fneg_select_v2i32:
+define <2 x i32> @fneg_1_fabs_2_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b) {
+; GCN-LABEL: fneg_1_fabs_2_select_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
@@ -232,7 +232,7 @@ define <2 x i32> @fneg_select_v2i32(<2 x i32> %cond, <2 x i32> %a, <2 x i32> %b)
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, -v3, |v3|, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: fneg_select_v2i32:
+; GFX11-LABEL: fneg_1_fabs_2_select_v2i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0

>From 4bd51d01444f925de88274e185de3be53e0c76b8 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 14 Jul 2025 08:48:16 -0500
Subject: [PATCH 26/28] Add tighter constraints on applying the combine and
 update tests.

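In outline, the hook now refuses the fold unless the bound operation is a
bitwise op whose RHS is a (splat) constant. A rough standalone model of that
gating (illustration only; the enum and helper names are hypothetical, not
the in-tree API):

  #include <cassert>
  #include <cstdint>
  #include <optional>

  enum class Opc { Add, Sub, And, Or, Xor };

  static bool shouldFoldSelect(Opc binOp, std::optional<uint32_t> rhsSplat) {
    if (binOp != Opc::And && binOp != Opc::Or && binOp != Opc::Xor)
      return false;          // only bitwise idioms can become source modifiers
    if (!rhsSplat)
      return false;          // isConstOrConstSplat() found no constant
    return true;             // the existing i32 scalar-type check still applies
  }

  int main() {
    assert(shouldFoldSelect(Opc::Xor, 0x80000000u));
    assert(!shouldFoldSelect(Opc::Add, 0x80000000u));
    assert(!shouldFoldSelect(Opc::Xor, std::nullopt));
  }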
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp      |  8 ++++++++
 .../atomic_optimizations_global_pointer.ll     | 18 ++++++++----------
 .../branch-folding-implicit-def-subreg.ll      | 18 +++++++++---------
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll  |  7 +++----
 .../AMDGPU/sdwa-peephole-cndmask-sext.ll       |  7 ++++---
 5 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 77632c1423f4e..5180c3805550d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15496,6 +15496,14 @@ SDValue SITargetLowering::performFMACombine(SDNode *N,
 bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
     unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
     SDValue Y) const {
+
+  if (BinOpcode != ISD::AND && BinOpcode != ISD::OR && BinOpcode != ISD::XOR)
+    return false;
+
+  ConstantSDNode *CY = isConstOrConstSplat(Y);
+  if (!CY)
+    return false;
+
   return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
           BinOpcode == ISD::XOR) &&
          (VT.getScalarType() == MVT::i32);
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 7584d3eb12928..3ca7db155b385 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -7145,13 +7145,12 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
@@ -8839,13 +8838,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    s_or_b32 s5, s4, s6
-; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s5
-; GFX7LESS-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX7LESS-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
+; GFX7LESS-NEXT:    v_or_b32_e32 v0, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 50efed6da381b..92c63fead15ac 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -913,15 +913,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec
   ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr3 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr3, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr53, 0, $vgpr10, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr3, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 $vgpr15, killed $vgpr2, implicit $exec
-  ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_sdwa 0, killed $vgpr17, 0, $vgpr10, 0, 0, 6, implicit $exec
-  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, killed $vgpr2, 0, killed $vgpr15, killed $vcc, implicit $exec
-  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr10, killed renamable $vgpr2, renamable $vgpr10, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
+  ; GFX90A-NEXT:   renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr10, killed $vgpr2, implicit $exec
+  ; GFX90A-NEXT:   renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec
+  ; GFX90A-NEXT:   renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr15, implicit $exec
+  ; GFX90A-NEXT:   DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3)
   ; GFX90A-NEXT:   S_BRANCH %bb.65
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.68.bb174:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 40b33f48f4813..e687745469014 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s1, 1, s1
+; SI-NEXT:    s_cselect_b32 s0, 0, s0
 ; SI-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; SI-NEXT:    s_cmp_eq_u32 s1, 1
-; SI-NEXT:    s_cselect_b32 s0, 0x80008000, s0
 ; SI-NEXT:    s_cselect_b32 s0, 0, s0
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -4358,10 +4358,9 @@ define <2 x half> @v_fneg_select_infloop_regression_v2f16(<2 x half> %arg, i1 %a
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v1, 1, v2
-; SI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; SI-NEXT:    v_mov_b32_e32 v2, 0x80008000
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; SI-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; SI-NEXT:    v_cndmask_b32_e64 v1, v0, 0, vcc
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
index 2549e76821e1c..2c7819a395c86 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-sext.ll
@@ -8,10 +8,11 @@ define i32 @test_select_on_sext_sdwa(i8 %x, i32 %y, i1 %cond)  {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_and_b32_e32 v2, 1, v2
-; CHECK-NEXT:    v_or_b32_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CHECK-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; CHECK-NEXT:    v_or_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %sext = sext i8 %x to i32
   %select = select i1 %cond, i32 %sext, i32 0

>From b0140bd97b08bfb326873a2a7bcee61951bb0bd5 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 14 Jul 2025 09:36:53 -0500
Subject: [PATCH 27/28] Further constrain
 shouldFoldSelectWithIdentityConstant(), preventing regressions

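The extra constraint admits only the two mask values that actually denote
modifiers. A standalone sketch (illustration only) of what the
APInt::isSignMask()/isMaxSignedValue() tests accept for an i32 constant:

  #include <cassert>
  #include <cstdint>

  static bool isQualifyingMask(uint32_t c) {
    const uint32_t SignMask = 0x80000000u;     // APInt::isSignMask()
    return c == SignMask || c == SignMask - 1; // isMaxSignedValue(): 0x7fffffff
  }

  int main() {
    assert(isQualifyingMask(0x80000000u));  // xor/or mask -> fneg / fneg+fabs
    assert(isQualifyingMask(0x7fffffffu));  // and mask    -> fabs
    assert(!isQualifyingMask(0x0000ffffu)); // arbitrary masks no longer fold
  }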
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   11 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     | 2012 +++++++++--------
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |  302 +--
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |    7 +-
 llvm/test/CodeGen/AMDGPU/fptrunc.ll           |  133 +-
 .../AMDGPU/fptrunc.v2f16.no.fast.math.ll      |   64 +-
 6 files changed, 1294 insertions(+), 1235 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5180c3805550d..44f66a1de37fc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15504,9 +15504,14 @@ bool SITargetLowering::shouldFoldSelectWithIdentityConstant(
   if (!CY)
     return false;
 
-  return (BinOpcode == ISD::AND || BinOpcode == ISD::OR ||
-          BinOpcode == ISD::XOR) &&
-         (VT.getScalarType() == MVT::i32);
+  if (!CY->getAPIntValue().isSignMask() &&
+      !CY->getAPIntValue().isMaxSignedValue())
+    return false;
+
+  if (VT.getScalarType() != MVT::i32)
+    return false;
+
+  return true;
 }
 
 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 606f6d1e3939b..ba4fe3685458d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -864,20 +864,21 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; SI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, v5, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, v5, v6
-; SI-NEXT:    v_or_b32_e32 v7, 1, v6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 12, v4
+; SI-NEXT:    v_or_b32_e32 v3, v6, v3
 ; SI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v5, 7, v3
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
+; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x7c00
@@ -913,20 +914,21 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v3
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s4, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v1
+; VI-NEXT:    v_or_b32_e32 v3, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -958,18 +960,19 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; GFX9-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, v4, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; GFX9-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0xfffffc10, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX9-NEXT:    v_or_b32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_lshl_or_b32 v4, v1, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v4, 7, v3
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -999,35 +1002,36 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v3, v0
 ; GFX11-NEXT:    v_med3_i32 v3, v4, 0, 13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT:    v_lshl_or_b32 v7, v1, 12, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-NEXT:    v_lshl_or_b32 v4, v1, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc double %mag to half
@@ -1053,31 +1057,29 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; SI-NEXT:    s_or_b32 s3, s0, 0x1000
 ; SI-NEXT:    v_readfirstlane_b32 s5, v0
 ; SI-NEXT:    s_lshr_b32 s6, s3, s5
-; SI-NEXT:    s_or_b32 s7, s6, 1
 ; SI-NEXT:    s_lshl_b32 s5, s6, s5
 ; SI-NEXT:    s_cmp_lg_u32 s5, s3
-; SI-NEXT:    s_cselect_b32 s3, s7, s6
-; SI-NEXT:    s_add_i32 s8, s4, 0xfffffc10
-; SI-NEXT:    s_lshl_b32 s4, s8, 12
-; SI-NEXT:    s_or_b32 s4, s0, s4
-; SI-NEXT:    s_cmp_lt_i32 s8, 1
-; SI-NEXT:    s_cselect_b32 s3, s3, s4
-; SI-NEXT:    s_and_b32 s6, s3, 7
-; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 5
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; SI-NEXT:    s_cselect_b32 s4, 1, 0
+; SI-NEXT:    s_cselect_b32 s3, 1, 0
+; SI-NEXT:    s_addk_i32 s4, 0xfc10
+; SI-NEXT:    s_lshl_b32 s5, s4, 12
+; SI-NEXT:    s_or_b32 s3, s6, s3
+; SI-NEXT:    s_or_b32 s5, s0, s5
+; SI-NEXT:    s_cmp_lt_i32 s4, 1
+; SI-NEXT:    s_cselect_b32 s3, s3, s5
+; SI-NEXT:    s_and_b32 s5, s3, 7
+; SI-NEXT:    s_cmp_gt_i32 s5, 5
+; SI-NEXT:    s_cselect_b32 s6, 1, 0
+; SI-NEXT:    s_cmp_eq_u32 s5, 3
+; SI-NEXT:    s_cselect_b32 s5, 1, 0
+; SI-NEXT:    s_or_b32 s5, s5, s6
 ; SI-NEXT:    s_lshr_b32 s3, s3, 2
-; SI-NEXT:    s_add_i32 s3, s3, s4
-; SI-NEXT:    s_cmp_lt_i32 s8, 31
+; SI-NEXT:    s_add_i32 s3, s3, s5
+; SI-NEXT:    s_cmp_lt_i32 s4, 31
 ; SI-NEXT:    s_cselect_b32 s3, s3, 0x7c00
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_movk_i32 s0, 0x7e00
 ; SI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
+; SI-NEXT:    s_cmpk_eq_i32 s4, 0x40f
 ; SI-NEXT:    s_cselect_b32 s0, s0, s3
 ; SI-NEXT:    s_lshr_b32 s1, s1, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
@@ -1102,37 +1104,35 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; VI-NEXT:    s_and_b32 s0, s0, 0xffe
 ; VI-NEXT:    v_readfirstlane_b32 s3, v0
 ; VI-NEXT:    s_sub_i32 s4, 0x3f1, s1
-; VI-NEXT:    s_or_b32 s3, s0, s3
+; VI-NEXT:    s_or_b32 s0, s0, s3
 ; VI-NEXT:    v_med3_i32 v0, s4, 0, 13
-; VI-NEXT:    s_or_b32 s0, s3, 0x1000
+; VI-NEXT:    s_or_b32 s3, s0, 0x1000
 ; VI-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-NEXT:    s_lshr_b32 s5, s0, s4
-; VI-NEXT:    s_or_b32 s6, s5, 1
+; VI-NEXT:    s_lshr_b32 s5, s3, s4
 ; VI-NEXT:    s_lshl_b32 s4, s5, s4
-; VI-NEXT:    s_cmp_lg_u32 s4, s0
-; VI-NEXT:    s_cselect_b32 s0, s6, s5
-; VI-NEXT:    s_add_i32 s6, s1, 0xfffffc10
-; VI-NEXT:    s_lshl_b32 s1, s6, 12
-; VI-NEXT:    s_or_b32 s1, s3, s1
-; VI-NEXT:    s_cmp_lt_i32 s6, 1
-; VI-NEXT:    s_cselect_b32 s7, s0, s1
-; VI-NEXT:    s_and_b32 s4, s7, 7
-; VI-NEXT:    s_cmp_eq_u32 s4, 3
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s4, s3
+; VI-NEXT:    s_cselect_b32 s3, 1, 0
+; VI-NEXT:    s_addk_i32 s1, 0xfc10
+; VI-NEXT:    s_lshl_b32 s4, s1, 12
+; VI-NEXT:    s_or_b32 s3, s5, s3
+; VI-NEXT:    s_or_b32 s4, s0, s4
+; VI-NEXT:    s_cmp_lt_i32 s1, 1
+; VI-NEXT:    s_cselect_b32 s3, s3, s4
+; VI-NEXT:    s_and_b32 s4, s3, 7
 ; VI-NEXT:    s_cmp_gt_i32 s4, 5
-; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; VI-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, 1, 0
-; VI-NEXT:    s_lshr_b32 s1, s7, 2
-; VI-NEXT:    s_add_i32 s1, s1, s0
-; VI-NEXT:    s_cmp_lt_i32 s6, 31
-; VI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s3, 0
-; VI-NEXT:    s_movk_i32 s1, 0x7e00
-; VI-NEXT:    s_cselect_b32 s1, s1, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s6, 0x40f
-; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-NEXT:    s_cmp_eq_u32 s4, 3
+; VI-NEXT:    s_cselect_b32 s4, 1, 0
+; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_lshr_b32 s3, s3, 2
+; VI-NEXT:    s_add_i32 s3, s3, s4
+; VI-NEXT:    s_cmp_lt_i32 s1, 31
+; VI-NEXT:    s_cselect_b32 s3, s3, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_movk_i32 s0, 0x7e00
+; VI-NEXT:    s_cselect_b32 s0, s0, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
+; VI-NEXT:    s_cselect_b32 s0, s0, s3
 ; VI-NEXT:    s_movk_i32 s1, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
@@ -1152,37 +1152,35 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xffe
 ; GFX9-NEXT:    v_readfirstlane_b32 s3, v0
 ; GFX9-NEXT:    s_sub_i32 s4, 0x3f1, s1
-; GFX9-NEXT:    s_or_b32 s3, s0, s3
+; GFX9-NEXT:    s_or_b32 s0, s0, s3
 ; GFX9-NEXT:    v_med3_i32 v0, s4, 0, 13
-; GFX9-NEXT:    s_or_b32 s0, s3, 0x1000
+; GFX9-NEXT:    s_or_b32 s3, s0, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX9-NEXT:    s_lshr_b32 s5, s0, s4
-; GFX9-NEXT:    s_or_b32 s6, s5, 1
+; GFX9-NEXT:    s_lshr_b32 s5, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX9-NEXT:    s_cmp_lg_u32 s4, s0
-; GFX9-NEXT:    s_cselect_b32 s0, s6, s5
-; GFX9-NEXT:    s_add_i32 s6, s1, 0xfffffc10
-; GFX9-NEXT:    s_lshl_b32 s1, s6, 12
-; GFX9-NEXT:    s_or_b32 s1, s3, s1
-; GFX9-NEXT:    s_cmp_lt_i32 s6, 1
-; GFX9-NEXT:    s_cselect_b32 s7, s0, s1
-; GFX9-NEXT:    s_and_b32 s4, s7, 7
-; GFX9-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_lg_u32 s4, s3
+; GFX9-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX9-NEXT:    s_addk_i32 s1, 0xfc10
+; GFX9-NEXT:    s_lshl_b32 s4, s1, 12
+; GFX9-NEXT:    s_or_b32 s3, s5, s3
+; GFX9-NEXT:    s_or_b32 s4, s0, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s1, 1
+; GFX9-NEXT:    s_cselect_b32 s3, s3, s4
+; GFX9-NEXT:    s_and_b32 s4, s3, 7
 ; GFX9-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_lshr_b32 s1, s7, 2
-; GFX9-NEXT:    s_add_i32 s1, s1, s0
-; GFX9-NEXT:    s_cmp_lt_i32 s6, 31
-; GFX9-NEXT:    s_cselect_b32 s0, s1, 0x7c00
-; GFX9-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX9-NEXT:    s_movk_i32 s1, 0x7e00
-; GFX9-NEXT:    s_cselect_b32 s1, s1, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x40f
-; GFX9-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s4, 3
+; GFX9-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX9-NEXT:    s_or_b32 s4, s4, s5
+; GFX9-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX9-NEXT:    s_add_i32 s3, s3, s4
+; GFX9-NEXT:    s_cmp_lt_i32 s1, 31
+; GFX9-NEXT:    s_cselect_b32 s3, s3, 0x7c00
+; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX9-NEXT:    s_movk_i32 s0, 0x7e00
+; GFX9-NEXT:    s_cselect_b32 s0, s0, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s1, 0x40f
+; GFX9-NEXT:    s_cselect_b32 s0, s0, s3
 ; GFX9-NEXT:    s_movk_i32 s1, 0x7fff
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
@@ -1213,26 +1211,23 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s5, s3, s4
 ; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-NEXT:    s_or_b32 s6, s5, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s4, s3
-; GFX11-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX11-NEXT:    s_cselect_b32 s3, 1, 0
 ; GFX11-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s3, s5, s3
 ; GFX11-NEXT:    s_lshl_b32 s4, s0, 12
 ; GFX11-NEXT:    s_or_b32 s4, s1, s4
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX11-NEXT:    s_cselect_b32 s3, s3, s4
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s4, s3, 7
-; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX11-NEXT:    s_cmp_gt_i32 s4, 5
-; GFX11-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-NEXT:    s_and_b32 s4, s4, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
 ; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX11-NEXT:    s_lshr_b32 s3, s3, 2
+; GFX11-NEXT:    s_or_b32 s4, s4, s5
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s3, s3, s4
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 31
@@ -3034,27 +3029,28 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffe, v6
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v7, v3, 20, 11
-; SI-NEXT:    s_movk_i32 s6, 0x3f1
+; SI-NEXT:    s_movk_i32 s4, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v2, v6, v2
-; SI-NEXT:    v_sub_i32_e32 v8, vcc, s6, v7
+; SI-NEXT:    v_sub_i32_e32 v8, vcc, s4, v7
 ; SI-NEXT:    v_or_b32_e32 v6, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, v8, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, v8, v9
-; SI-NEXT:    v_or_b32_e32 v10, 1, v9
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v6
-; SI-NEXT:    s_movk_i32 s7, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
+; SI-NEXT:    s_movk_i32 s5, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, s5, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v8, 12, v7
+; SI-NEXT:    v_or_b32_e32 v6, v9, v6
 ; SI-NEXT:    v_or_b32_e32 v8, v2, v8
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; SI-NEXT:    v_and_b32_e32 v8, 7, v6
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
+; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v8, v8, v9
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; SI-NEXT:    v_mov_b32_e32 v8, 0x7c00
@@ -3062,9 +3058,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v8, v6, vcc
 ; SI-NEXT:    v_mov_b32_e32 v9, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; SI-NEXT:    s_movk_i32 s8, 0x40f
+; SI-NEXT:    s_movk_i32 s6, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v6, 0x1ff, v1
@@ -3077,24 +3073,25 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v6, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v7, vcc, s6, v6
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, s4, v6
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, v7, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
-; SI-NEXT:    v_or_b32_e32 v11, 1, v10
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v11, vcc
-; SI-NEXT:    v_add_i32_e32 v6, vcc, s7, v6
+; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v6, vcc, s5, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v3
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v7, v7, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
@@ -3103,7 +3100,7 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v8, v9, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -3127,27 +3124,28 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v5, 0xffe, v5
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
-; VI-NEXT:    s_movk_i32 s6, 0x3f1
+; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, s6, v3
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v3
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
-; VI-NEXT:    v_or_b32_e32 v8, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v5
-; VI-NEXT:    s_movk_i32 s7, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
+; VI-NEXT:    s_movk_i32 s5, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v3
+; VI-NEXT:    v_or_b32_e32 v5, v7, v5
 ; VI-NEXT:    v_or_b32_e32 v6, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v6
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7c00
@@ -3155,9 +3153,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; VI-NEXT:    s_movk_i32 s8, 0x40f
+; VI-NEXT:    s_movk_i32 s6, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
@@ -3167,31 +3165,32 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v3, v0
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, s6, v1
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
-; VI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v3
-; VI-NEXT:    v_cndmask_b32_e32 v3, v8, v9, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
+; VI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v1
+; VI-NEXT:    v_or_b32_e32 v3, v8, v3
 ; VI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 7, v3
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v5, v5, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -3203,31 +3202,32 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX9-NEXT:    s_movk_i32 s7, 0xffe
+; GFX9-NEXT:    s_movk_i32 s5, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 0x3f1, v6
 ; GFX9-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
-; GFX9-NEXT:    v_or_b32_e32 v9, 1, v8
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 0xfffffc10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX9-NEXT:    v_or_b32_e32 v5, v8, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v6, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v7, 7, v5
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -3235,46 +3235,47 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_movk_i32 s8, 0x40f
+; GFX9-NEXT:    s_movk_i32 s6, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    s_mov_b32 s9, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
+; GFX9-NEXT:    s_mov_b32 s7, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v6, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v9, v6, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, v6, v9
-; GFX9-NEXT:    v_or_b32_e32 v10, 1, v9
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX9-NEXT:    v_or_b32_e32 v2, v9, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v6, v5, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v6, v6, v9
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
@@ -3288,11 +3289,12 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v6, v3, 20, 11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v7, v1, 20, 11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v6
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v5, v2
@@ -3309,59 +3311,61 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v5, v10
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, 1, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 1, v12
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v8, v11, v13 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v7, 12, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v9, v6, 12, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v10
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v6, 12, v2
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v11, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v14, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, 0xfffffc10, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v7, 12, v0
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v5, v10, v5 :: v_dual_mov_b32 v10, 0x7e00
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v9, 7, v8
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v11, 7, v5
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v9
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.h, 0
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v3.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.h, 0
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v8, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v11, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v10, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v10, 7, v5
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s2, s1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v3, v8, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v11 :: v_dual_add_nc_u32 v5, v5, v12
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v10 :: v_dual_add_nc_u32 v5, v5, v11
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v7
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v9, v2
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v12, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v12.l, v1.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v9, v0
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v12, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v4
@@ -3374,15 +3378,17 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v6, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT:    v_bfe_u32 v7, v3, 20, 11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v13, 0x7e00
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v6
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v5, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, 0x3f1, v7
@@ -3395,59 +3401,62 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v8, v9
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, v5, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 1, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 1, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v11, v14 :: v_dual_add_nc_u32 v7, 0xfffffc10, v7
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
 ; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v7, 12, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v9
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v6, 12, v0
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v10
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v5, v12, v15 :: v_dual_add_nc_u32 v6, 0xfffffc10, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v6, 12, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v8, v11, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v13, v8, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v7
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v7, 0xfffffc10, v7
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v12, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v10, v7, 12, v2
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v7
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v9, 7, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v9
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 7, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v9
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v9, v11
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v10
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v9
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v10, v10, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v9
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, 0x7c00, v8 :: v_dual_add_nc_u32 v5, v5, v10
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v4
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc <2 x double> %mag to <2 x half>
@@ -3844,82 +3853,78 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; SI-NEXT:    s_or_b32 s4, s0, 0x1000
 ; SI-NEXT:    v_readfirstlane_b32 s6, v2
 ; SI-NEXT:    s_lshr_b32 s7, s4, s6
-; SI-NEXT:    s_or_b32 s8, s7, 1
 ; SI-NEXT:    s_lshl_b32 s6, s7, s6
 ; SI-NEXT:    s_cmp_lg_u32 s6, s4
-; SI-NEXT:    s_cselect_b32 s4, s8, s7
-; SI-NEXT:    s_add_i32 s8, s5, 0xfffffc10
-; SI-NEXT:    s_lshl_b32 s5, s8, 12
-; SI-NEXT:    s_or_b32 s5, s0, s5
-; SI-NEXT:    s_cmp_lt_i32 s8, 1
-; SI-NEXT:    s_cselect_b32 s9, s4, s5
-; SI-NEXT:    s_and_b32 s6, s9, 7
-; SI-NEXT:    s_cmp_eq_u32 s6, 3
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s6, 5
-; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; SI-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
-; SI-NEXT:    s_and_b64 s[4:5], s[4:5], exec
 ; SI-NEXT:    s_cselect_b32 s4, 1, 0
-; SI-NEXT:    s_lshr_b32 s5, s9, 2
-; SI-NEXT:    s_add_i32 s5, s5, s4
-; SI-NEXT:    s_cmp_lt_i32 s8, 31
-; SI-NEXT:    s_cselect_b32 s4, s5, 0x7c00
+; SI-NEXT:    s_addk_i32 s5, 0xfc10
+; SI-NEXT:    s_lshl_b32 s6, s5, 12
+; SI-NEXT:    s_or_b32 s4, s7, s4
+; SI-NEXT:    s_or_b32 s6, s0, s6
+; SI-NEXT:    s_cmp_lt_i32 s5, 1
+; SI-NEXT:    s_cselect_b32 s4, s4, s6
+; SI-NEXT:    s_and_b32 s6, s4, 7
+; SI-NEXT:    s_cmp_gt_i32 s6, 5
+; SI-NEXT:    s_cselect_b32 s7, 1, 0
+; SI-NEXT:    s_cmp_eq_u32 s6, 3
+; SI-NEXT:    s_cselect_b32 s6, 1, 0
+; SI-NEXT:    s_or_b32 s6, s6, s7
+; SI-NEXT:    s_lshr_b32 s4, s4, 2
+; SI-NEXT:    s_add_i32 s4, s4, s6
+; SI-NEXT:    s_cmp_lt_i32 s5, 31
+; SI-NEXT:    s_cselect_b32 s4, s4, 0x7c00
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_movk_i32 s6, 0x7e00
 ; SI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
+; SI-NEXT:    s_cmpk_eq_i32 s5, 0x40f
 ; SI-NEXT:    s_cselect_b32 s0, s0, s4
 ; SI-NEXT:    s_lshr_b32 s1, s1, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
-; SI-NEXT:    s_or_b32 s7, s1, s0
+; SI-NEXT:    s_or_b32 s4, s1, s0
 ; SI-NEXT:    s_lshr_b32 s0, s3, 8
-; SI-NEXT:    s_and_b32 s4, s0, 0xffe
+; SI-NEXT:    s_and_b32 s5, s0, 0xffe
 ; SI-NEXT:    s_and_b32 s0, s3, 0x1ff
 ; SI-NEXT:    s_or_b32 s0, s0, s2
 ; SI-NEXT:    s_cmp_lg_u32 s0, 0
 ; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; SI-NEXT:    v_readfirstlane_b32 s0, v2
-; SI-NEXT:    s_bfe_u32 s1, s3, 0xb0014
-; SI-NEXT:    s_or_b32 s2, s4, s0
-; SI-NEXT:    s_sub_i32 s4, 0x3f1, s1
-; SI-NEXT:    v_med3_i32 v2, s4, 0, 13
-; SI-NEXT:    s_or_b32 s0, s2, 0x1000
-; SI-NEXT:    v_readfirstlane_b32 s4, v2
-; SI-NEXT:    s_lshr_b32 s5, s0, s4
-; SI-NEXT:    s_or_b32 s8, s5, 1
-; SI-NEXT:    s_lshl_b32 s4, s5, s4
-; SI-NEXT:    s_cmp_lg_u32 s4, s0
-; SI-NEXT:    s_cselect_b32 s0, s8, s5
-; SI-NEXT:    s_add_i32 s8, s1, 0xfffffc10
-; SI-NEXT:    s_lshl_b32 s1, s8, 12
-; SI-NEXT:    s_or_b32 s1, s2, s1
-; SI-NEXT:    s_cmp_lt_i32 s8, 1
-; SI-NEXT:    s_cselect_b32 s9, s0, s1
-; SI-NEXT:    s_and_b32 s4, s9, 7
-; SI-NEXT:    s_cmp_eq_u32 s4, 3
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT:    s_cmp_gt_i32 s4, 5
-; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; SI-NEXT:    s_cselect_b32 s0, 1, 0
-; SI-NEXT:    s_lshr_b32 s1, s9, 2
-; SI-NEXT:    s_add_i32 s1, s1, s0
-; SI-NEXT:    s_cmp_lt_i32 s8, 31
-; SI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
-; SI-NEXT:    s_cmp_lg_u32 s2, 0
-; SI-NEXT:    s_cselect_b32 s1, s6, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
-; SI-NEXT:    s_cselect_b32 s0, s1, s0
+; SI-NEXT:    s_bfe_u32 s2, s3, 0xb0014
+; SI-NEXT:    s_or_b32 s0, s5, s0
+; SI-NEXT:    s_sub_i32 s5, 0x3f1, s2
+; SI-NEXT:    v_med3_i32 v2, s5, 0, 13
+; SI-NEXT:    s_or_b32 s1, s0, 0x1000
+; SI-NEXT:    v_readfirstlane_b32 s5, v2
+; SI-NEXT:    s_lshr_b32 s7, s1, s5
+; SI-NEXT:    s_lshl_b32 s5, s7, s5
+; SI-NEXT:    s_cmp_lg_u32 s5, s1
+; SI-NEXT:    s_cselect_b32 s1, 1, 0
+; SI-NEXT:    s_addk_i32 s2, 0xfc10
+; SI-NEXT:    s_lshl_b32 s5, s2, 12
+; SI-NEXT:    s_or_b32 s1, s7, s1
+; SI-NEXT:    s_or_b32 s5, s0, s5
+; SI-NEXT:    s_cmp_lt_i32 s2, 1
+; SI-NEXT:    s_cselect_b32 s1, s1, s5
+; SI-NEXT:    s_and_b32 s5, s1, 7
+; SI-NEXT:    s_cmp_gt_i32 s5, 5
+; SI-NEXT:    s_cselect_b32 s7, 1, 0
+; SI-NEXT:    s_cmp_eq_u32 s5, 3
+; SI-NEXT:    s_cselect_b32 s5, 1, 0
+; SI-NEXT:    s_or_b32 s5, s5, s7
+; SI-NEXT:    s_lshr_b32 s1, s1, 2
+; SI-NEXT:    s_add_i32 s1, s1, s5
+; SI-NEXT:    s_cmp_lt_i32 s2, 31
+; SI-NEXT:    s_cselect_b32 s1, s1, 0x7c00
+; SI-NEXT:    s_cmp_lg_u32 s0, 0
+; SI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
+; SI-NEXT:    s_cmpk_eq_i32 s2, 0x40f
+; SI-NEXT:    s_cselect_b32 s0, s0, s1
 ; SI-NEXT:    s_lshr_b32 s1, s3, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
 ; SI-NEXT:    s_or_b32 s0, s1, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s7
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
 ; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_bfi_b32 v0, s0, v2, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
@@ -3942,38 +3947,36 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    s_bfe_u32 s3, s3, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s2, v0
 ; VI-NEXT:    s_sub_i32 s6, 0x3f1, s3
-; VI-NEXT:    s_or_b32 s5, s5, s2
+; VI-NEXT:    s_or_b32 s2, s5, s2
 ; VI-NEXT:    v_med3_i32 v0, s6, 0, 13
-; VI-NEXT:    s_or_b32 s2, s5, 0x1000
+; VI-NEXT:    s_or_b32 s5, s2, 0x1000
 ; VI-NEXT:    v_readfirstlane_b32 s6, v0
-; VI-NEXT:    s_lshr_b32 s7, s2, s6
-; VI-NEXT:    s_or_b32 s8, s7, 1
+; VI-NEXT:    s_lshr_b32 s7, s5, s6
 ; VI-NEXT:    s_lshl_b32 s6, s7, s6
-; VI-NEXT:    s_cmp_lg_u32 s6, s2
-; VI-NEXT:    s_cselect_b32 s2, s8, s7
-; VI-NEXT:    s_add_i32 s8, s3, 0xfffffc10
-; VI-NEXT:    s_lshl_b32 s3, s8, 12
-; VI-NEXT:    s_or_b32 s3, s5, s3
-; VI-NEXT:    s_cmp_lt_i32 s8, 1
-; VI-NEXT:    s_cselect_b32 s9, s2, s3
-; VI-NEXT:    s_and_b32 s6, s9, 7
-; VI-NEXT:    s_cmp_eq_u32 s6, 3
-; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT:    s_cmp_lg_u32 s6, s5
+; VI-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-NEXT:    s_addk_i32 s3, 0xfc10
+; VI-NEXT:    s_lshl_b32 s6, s3, 12
+; VI-NEXT:    s_or_b32 s5, s7, s5
+; VI-NEXT:    s_or_b32 s6, s2, s6
+; VI-NEXT:    s_cmp_lt_i32 s3, 1
+; VI-NEXT:    s_cselect_b32 s5, s5, s6
+; VI-NEXT:    s_and_b32 s6, s5, 7
 ; VI-NEXT:    s_cmp_gt_i32 s6, 5
-; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; VI-NEXT:    s_and_b64 s[2:3], s[2:3], exec
-; VI-NEXT:    s_cselect_b32 s2, 1, 0
-; VI-NEXT:    s_lshr_b32 s3, s9, 2
-; VI-NEXT:    s_add_i32 s3, s3, s2
-; VI-NEXT:    s_cmp_lt_i32 s8, 31
-; VI-NEXT:    s_cselect_b32 s2, s3, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s5, 0
-; VI-NEXT:    s_movk_i32 s5, 0x7e00
-; VI-NEXT:    s_cselect_b32 s3, s5, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
-; VI-NEXT:    s_cselect_b32 s2, s3, s2
-; VI-NEXT:    s_lshl_b32 s6, s2, 16
+; VI-NEXT:    s_cselect_b32 s7, 1, 0
+; VI-NEXT:    s_cmp_eq_u32 s6, 3
+; VI-NEXT:    s_cselect_b32 s6, 1, 0
+; VI-NEXT:    s_or_b32 s6, s6, s7
+; VI-NEXT:    s_lshr_b32 s5, s5, 2
+; VI-NEXT:    s_add_i32 s5, s5, s6
+; VI-NEXT:    s_cmp_lt_i32 s3, 31
+; VI-NEXT:    s_cselect_b32 s5, s5, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s2, 0
+; VI-NEXT:    s_movk_i32 s6, 0x7e00
+; VI-NEXT:    s_cselect_b32 s2, s6, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s3, 0x40f
+; VI-NEXT:    s_cselect_b32 s2, s2, s5
+; VI-NEXT:    s_lshl_b32 s5, s2, 16
 ; VI-NEXT:    s_lshr_b32 s2, s1, 8
 ; VI-NEXT:    s_and_b32 s7, s2, 0xffe
 ; VI-NEXT:    s_and_b32 s2, s1, 0x1ff
@@ -3983,39 +3986,37 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
 ; VI-NEXT:    s_bfe_u32 s1, s1, 0xb0014
 ; VI-NEXT:    v_readfirstlane_b32 s0, v0
-; VI-NEXT:    s_sub_i32 s2, 0x3f1, s1
-; VI-NEXT:    s_or_b32 s7, s7, s0
-; VI-NEXT:    v_med3_i32 v0, s2, 0, 13
-; VI-NEXT:    s_or_b32 s0, s7, 0x1000
-; VI-NEXT:    v_readfirstlane_b32 s2, v0
-; VI-NEXT:    s_lshr_b32 s3, s0, s2
-; VI-NEXT:    s_or_b32 s8, s3, 1
-; VI-NEXT:    s_lshl_b32 s2, s3, s2
-; VI-NEXT:    s_cmp_lg_u32 s2, s0
-; VI-NEXT:    s_cselect_b32 s0, s8, s3
-; VI-NEXT:    s_add_i32 s8, s1, 0xfffffc10
-; VI-NEXT:    s_lshl_b32 s1, s8, 12
-; VI-NEXT:    s_or_b32 s1, s7, s1
-; VI-NEXT:    s_cmp_lt_i32 s8, 1
-; VI-NEXT:    s_cselect_b32 s9, s0, s1
-; VI-NEXT:    s_and_b32 s2, s9, 7
-; VI-NEXT:    s_cmp_eq_u32 s2, 3
-; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; VI-NEXT:    s_cmp_gt_i32 s2, 5
-; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT:    s_cselect_b32 s0, 1, 0
-; VI-NEXT:    s_lshr_b32 s1, s9, 2
-; VI-NEXT:    s_add_i32 s1, s1, s0
-; VI-NEXT:    s_cmp_lt_i32 s8, 31
-; VI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
-; VI-NEXT:    s_cmp_lg_u32 s7, 0
-; VI-NEXT:    s_cselect_b32 s1, s5, 0x7c00
-; VI-NEXT:    s_cmpk_eq_i32 s8, 0x40f
-; VI-NEXT:    s_cselect_b32 s0, s1, s0
+; VI-NEXT:    s_sub_i32 s3, 0x3f1, s1
+; VI-NEXT:    s_or_b32 s0, s7, s0
+; VI-NEXT:    v_med3_i32 v0, s3, 0, 13
+; VI-NEXT:    s_or_b32 s2, s0, 0x1000
+; VI-NEXT:    v_readfirstlane_b32 s3, v0
+; VI-NEXT:    s_lshr_b32 s7, s2, s3
+; VI-NEXT:    s_lshl_b32 s3, s7, s3
+; VI-NEXT:    s_cmp_lg_u32 s3, s2
+; VI-NEXT:    s_cselect_b32 s2, 1, 0
+; VI-NEXT:    s_addk_i32 s1, 0xfc10
+; VI-NEXT:    s_lshl_b32 s3, s1, 12
+; VI-NEXT:    s_or_b32 s2, s7, s2
+; VI-NEXT:    s_or_b32 s3, s0, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 1
+; VI-NEXT:    s_cselect_b32 s2, s2, s3
+; VI-NEXT:    s_and_b32 s3, s2, 7
+; VI-NEXT:    s_cmp_gt_i32 s3, 5
+; VI-NEXT:    s_cselect_b32 s7, 1, 0
+; VI-NEXT:    s_cmp_eq_u32 s3, 3
+; VI-NEXT:    s_cselect_b32 s3, 1, 0
+; VI-NEXT:    s_or_b32 s3, s3, s7
+; VI-NEXT:    s_lshr_b32 s2, s2, 2
+; VI-NEXT:    s_add_i32 s2, s2, s3
+; VI-NEXT:    s_cmp_lt_i32 s1, 31
+; VI-NEXT:    s_cselect_b32 s2, s2, 0x7c00
+; VI-NEXT:    s_cmp_lg_u32 s0, 0
+; VI-NEXT:    s_cselect_b32 s0, s6, 0x7c00
+; VI-NEXT:    s_cmpk_eq_i32 s1, 0x40f
+; VI-NEXT:    s_cselect_b32 s0, s0, s2
 ; VI-NEXT:    s_and_b32 s0, s0, 0x7fff
-; VI-NEXT:    s_or_b32 s0, s0, s6
+; VI-NEXT:    s_or_b32 s0, s0, s5
 ; VI-NEXT:    s_mov_b32 s1, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4040,31 +4041,29 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-NEXT:    s_or_b32 s5, s2, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX9-NEXT:    s_lshr_b32 s8, s5, s7
-; GFX9-NEXT:    s_or_b32 s9, s8, 1
 ; GFX9-NEXT:    s_lshl_b32 s7, s8, s7
 ; GFX9-NEXT:    s_cmp_lg_u32 s7, s5
-; GFX9-NEXT:    s_cselect_b32 s5, s9, s8
-; GFX9-NEXT:    s_add_i32 s10, s6, 0xfffffc10
-; GFX9-NEXT:    s_lshl_b32 s6, s10, 12
-; GFX9-NEXT:    s_or_b32 s6, s2, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s10, 1
-; GFX9-NEXT:    s_cselect_b32 s5, s5, s6
-; GFX9-NEXT:    s_and_b32 s8, s5, 7
-; GFX9-NEXT:    s_cmp_eq_u32 s8, 3
-; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT:    s_cmp_gt_i32 s8, 5
-; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; GFX9-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
-; GFX9-NEXT:    s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX9-NEXT:    s_addk_i32 s6, 0xfc10
+; GFX9-NEXT:    s_lshl_b32 s7, s6, 12
+; GFX9-NEXT:    s_or_b32 s5, s8, s5
+; GFX9-NEXT:    s_or_b32 s7, s2, s7
+; GFX9-NEXT:    s_cmp_lt_i32 s6, 1
+; GFX9-NEXT:    s_cselect_b32 s5, s5, s7
+; GFX9-NEXT:    s_and_b32 s7, s5, 7
+; GFX9-NEXT:    s_cmp_gt_i32 s7, 5
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
+; GFX9-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX9-NEXT:    s_or_b32 s7, s7, s8
 ; GFX9-NEXT:    s_lshr_b32 s5, s5, 2
-; GFX9-NEXT:    s_add_i32 s5, s5, s6
-; GFX9-NEXT:    s_cmp_lt_i32 s10, 31
+; GFX9-NEXT:    s_add_i32 s5, s5, s7
+; GFX9-NEXT:    s_cmp_lt_i32 s6, 31
 ; GFX9-NEXT:    s_cselect_b32 s5, s5, 0x7c00
 ; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX9-NEXT:    s_movk_i32 s8, 0x7e00
-; GFX9-NEXT:    s_cselect_b32 s2, s8, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s10, 0x40f
+; GFX9-NEXT:    s_movk_i32 s7, 0x7e00
+; GFX9-NEXT:    s_cselect_b32 s2, s7, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s6, 0x40f
 ; GFX9-NEXT:    s_cselect_b32 s2, s2, s5
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, 0x8000
@@ -4083,31 +4082,29 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX9-NEXT:    v_med3_i32 v0, s6, 0, 13
 ; GFX9-NEXT:    s_or_b32 s2, s0, 0x1000
 ; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX9-NEXT:    s_lshr_b32 s7, s2, s6
-; GFX9-NEXT:    s_or_b32 s9, s7, 1
-; GFX9-NEXT:    s_lshl_b32 s6, s7, s6
+; GFX9-NEXT:    s_lshr_b32 s8, s2, s6
+; GFX9-NEXT:    s_lshl_b32 s6, s8, s6
 ; GFX9-NEXT:    s_cmp_lg_u32 s6, s2
-; GFX9-NEXT:    s_cselect_b32 s2, s9, s7
-; GFX9-NEXT:    s_add_i32 s9, s3, 0xfffffc10
-; GFX9-NEXT:    s_lshl_b32 s3, s9, 12
-; GFX9-NEXT:    s_or_b32 s3, s0, s3
-; GFX9-NEXT:    s_cmp_lt_i32 s9, 1
-; GFX9-NEXT:    s_cselect_b32 s10, s2, s3
-; GFX9-NEXT:    s_and_b32 s6, s10, 7
-; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
-; GFX9-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
-; GFX9-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
-; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
 ; GFX9-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX9-NEXT:    s_lshr_b32 s3, s10, 2
-; GFX9-NEXT:    s_add_i32 s3, s3, s2
-; GFX9-NEXT:    s_cmp_lt_i32 s9, 31
-; GFX9-NEXT:    s_cselect_b32 s2, s3, 0x7c00
+; GFX9-NEXT:    s_addk_i32 s3, 0xfc10
+; GFX9-NEXT:    s_lshl_b32 s6, s3, 12
+; GFX9-NEXT:    s_or_b32 s2, s8, s2
+; GFX9-NEXT:    s_or_b32 s6, s0, s6
+; GFX9-NEXT:    s_cmp_lt_i32 s3, 1
+; GFX9-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s2, 7
+; GFX9-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX9-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX9-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX9-NEXT:    s_or_b32 s6, s6, s8
+; GFX9-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX9-NEXT:    s_add_i32 s2, s2, s6
+; GFX9-NEXT:    s_cmp_lt_i32 s3, 31
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0x7c00
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_cselect_b32 s0, s8, 0x7c00
-; GFX9-NEXT:    s_cmpk_eq_i32 s9, 0x40f
+; GFX9-NEXT:    s_cselect_b32 s0, s7, 0x7c00
+; GFX9-NEXT:    s_cmpk_eq_i32 s3, 0x40f
 ; GFX9-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0x8000
@@ -4142,26 +4139,23 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s8, s6, s7
 ; GFX11-NEXT:    s_lshl_b32 s7, s8, s7
-; GFX11-NEXT:    s_or_b32 s9, s8, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s7, s6
-; GFX11-NEXT:    s_cselect_b32 s6, s9, s8
+; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s6, s8, s6
 ; GFX11-NEXT:    s_lshl_b32 s7, s2, 12
 ; GFX11-NEXT:    s_or_b32 s7, s5, s7
 ; GFX11-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX11-NEXT:    s_cselect_b32 s6, s6, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s7, s6, 7
-; GFX11-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX11-NEXT:    s_cmp_gt_i32 s7, 5
-; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s7, s7, s8
-; GFX11-NEXT:    s_and_b32 s7, s7, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s7, 3
 ; GFX11-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX11-NEXT:    s_lshr_b32 s6, s6, 2
+; GFX11-NEXT:    s_or_b32 s7, s7, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s6, s6, s7
 ; GFX11-NEXT:    s_cmp_lt_i32 s2, 31
@@ -4195,26 +4189,23 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s8, s5, s6
 ; GFX11-NEXT:    s_lshl_b32 s6, s8, s6
-; GFX11-NEXT:    s_or_b32 s9, s8, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX11-NEXT:    s_cselect_b32 s5, s9, s8
+; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-NEXT:    s_addk_i32 s0, 0xfc10
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s5, s8, s5
 ; GFX11-NEXT:    s_lshl_b32 s6, s0, 12
 ; GFX11-NEXT:    s_or_b32 s6, s3, s6
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 1
 ; GFX11-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s6, s5, 7
-; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX11-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s6, s6, s8
-; GFX11-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s8, 1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX11-NEXT:    s_or_b32 s6, s6, s8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_add_i32 s5, s5, s6
 ; GFX11-NEXT:    s_cmp_lt_i32 s0, 31
@@ -4674,27 +4665,28 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v9, 0xffe, v9
 ; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v10, v5, 20, 11
-; SI-NEXT:    s_movk_i32 s6, 0x3f1
+; SI-NEXT:    s_movk_i32 s4, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v4, v9, v4
-; SI-NEXT:    v_sub_i32_e32 v11, vcc, s6, v10
+; SI-NEXT:    v_sub_i32_e32 v11, vcc, s4, v10
 ; SI-NEXT:    v_or_b32_e32 v9, 0x1000, v4
 ; SI-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, v11, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
-; SI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v9
-; SI-NEXT:    s_movk_i32 s7, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v9, v12, v13, vcc
-; SI-NEXT:    v_add_i32_e32 v10, vcc, s7, v10
+; SI-NEXT:    s_movk_i32 s5, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v10, vcc, s5, v10
 ; SI-NEXT:    v_lshlrev_b32_e32 v11, 12, v10
+; SI-NEXT:    v_or_b32_e32 v9, v12, v9
 ; SI-NEXT:    v_or_b32_e32 v11, v4, v11
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; SI-NEXT:    v_and_b32_e32 v11, 7, v9
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
 ; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; SI-NEXT:    v_mov_b32_e32 v11, 0x7c00
@@ -4702,9 +4694,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; SI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; SI-NEXT:    s_movk_i32 s8, 0x40f
+; SI-NEXT:    s_movk_i32 s6, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v10
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_and_b32_e32 v9, 0x1ff, v3
@@ -4717,31 +4709,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v9, v3, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v2, v5, v2
-; SI-NEXT:    v_sub_i32_e32 v10, vcc, s6, v9
+; SI-NEXT:    v_sub_i32_e32 v10, vcc, s4, v9
 ; SI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, v10, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
-; SI-NEXT:    v_or_b32_e32 v14, 1, v13
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v5
-; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v14, vcc
-; SI-NEXT:    v_add_i32_e32 v9, vcc, s7, v9
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v9, vcc, s5, v9
 ; SI-NEXT:    v_lshlrev_b32_e32 v10, 12, v9
+; SI-NEXT:    v_or_b32_e32 v5, v13, v5
 ; SI-NEXT:    v_or_b32_e32 v10, v2, v10
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v10, 7, v5
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
+; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v10, v10, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v9
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v9
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
@@ -4754,24 +4747,25 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v5, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v9, vcc, s6, v5
+; SI-NEXT:    v_sub_i32_e32 v9, vcc, s4, v5
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v10, v9, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
-; SI-NEXT:    v_or_b32_e32 v13, 1, v10
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v10, v13, vcc
-; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
+; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v9, 12, v5
+; SI-NEXT:    v_or_b32_e32 v3, v10, v3
 ; SI-NEXT:    v_or_b32_e32 v9, v0, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v9, 7, v3
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
+; SI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
@@ -4779,7 +4773,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -4804,27 +4798,28 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v8, 0xffe, v8
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; VI-NEXT:    s_movk_i32 s6, 0x3f1
+; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v8, v4
-; VI-NEXT:    v_sub_u32_e32 v9, vcc, s6, v5
+; VI-NEXT:    v_sub_u32_e32 v9, vcc, s4, v5
 ; VI-NEXT:    v_or_b32_e32 v8, 0x1000, v4
 ; VI-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, v9, v8
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
-; VI-NEXT:    v_or_b32_e32 v11, 1, v10
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v8
-; VI-NEXT:    s_movk_i32 s7, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s7, v5
+; VI-NEXT:    s_movk_i32 s5, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v9, 12, v5
+; VI-NEXT:    v_or_b32_e32 v8, v10, v8
 ; VI-NEXT:    v_or_b32_e32 v9, v4, v9
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; VI-NEXT:    v_and_b32_e32 v9, 7, v8
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v9, v9, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; VI-NEXT:    v_mov_b32_e32 v9, 0x7c00
@@ -4832,9 +4827,9 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; VI-NEXT:    v_mov_b32_e32 v10, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT:    s_movk_i32 s8, 0x40f
+; VI-NEXT:    s_movk_i32 s6, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v8, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v8, v0
@@ -4844,31 +4839,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v5, v0
-; VI-NEXT:    v_sub_u32_e32 v8, vcc, s6, v1
+; VI-NEXT:    v_sub_u32_e32 v8, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v11, v8, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; VI-NEXT:    v_or_b32_e32 v12, 1, v11
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v5
-; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 12, v1
+; VI-NEXT:    v_or_b32_e32 v5, v11, v5
 ; VI-NEXT:    v_or_b32_e32 v8, v0, v8
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v8, 7, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
+; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v8, v8, v11
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v8
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v5, v2
@@ -4878,31 +4874,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_sub_u32_e32 v5, vcc, s6, v3
+; VI-NEXT:    v_sub_u32_e32 v5, vcc, s4, v3
 ; VI-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; VI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v5, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
-; VI-NEXT:    v_or_b32_e32 v11, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, v8, v11, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 12, v3
+; VI-NEXT:    v_or_b32_e32 v2, v8, v2
 ; VI-NEXT:    v_or_b32_e32 v5, v1, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v5, 7, v2
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v5, v5, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -4915,31 +4912,32 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v4
+; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s4, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX9-NEXT:    s_movk_i32 s7, 0xffe
+; GFX9-NEXT:    s_movk_i32 s5, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v4, v8, s7, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v8, s5, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v9, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v8, 0x1000, v4
 ; GFX9-NEXT:    v_med3_i32 v9, v9, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, v9, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v9, v9, v10
-; GFX9-NEXT:    v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v9, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v10, v11, vcc
+; GFX9-NEXT:    v_or_b32_e32 v8, v10, v8
 ; GFX9-NEXT:    v_lshl_or_b32 v9, v5, 12, v4
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v9, 7, v8
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v9
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v9
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
 ; GFX9-NEXT:    v_add_u32_e32 v8, v8, v9
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0x7c00
@@ -4947,77 +4945,79 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    s_movk_i32 s8, 0x40f
+; GFX9-NEXT:    s_movk_i32 s6, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX9-NEXT:    v_bfe_u32 v8, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v5, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v5, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 0x3f1, v8
 ; GFX9-NEXT:    v_or_b32_e32 v5, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v12, v11, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
-; GFX9-NEXT:    v_or_b32_e32 v13, 1, v12
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v8, 0xfffffc10, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
+; GFX9-NEXT:    v_or_b32_e32 v5, v12, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v8, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v11, v5, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v11, 7, v5
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v11, v11, v12
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; GFX9-NEXT:    v_add_u32_e32 v5, v5, v11
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v8
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    s_mov_b32 s9, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
+; GFX9-NEXT:    s_mov_b32 s7, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v5, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v8, 0x3f1, v5
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v8, v8, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, v8, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v8, v8, v11
-; GFX9-NEXT:    v_or_b32_e32 v12, 1, v11
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v11, v12, vcc
+; GFX9-NEXT:    v_or_b32_e32 v2, v11, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v8, v5, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v8
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v8
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v8, v8, v11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v8
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
@@ -5031,106 +5031,108 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v9, v3, 20, 11
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v3, 20, 11
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v1
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v1, 20, 11
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v1
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v14, v1, 20, 11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v5
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v11, 0x3f1, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v8, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v8, 0x3f1, v10
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_med3_i32 v10, v10, 0, 13
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, 0x1000, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v8, v2
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v8, 0x3f1, v9
+; GFX11-TRUE16-NEXT:    v_med3_i32 v11, v11, 0, 13
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v9, v2
+; GFX11-TRUE16-NEXT:    v_med3_i32 v8, v8, 0, 13
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, 0x1000, v4
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v13, v10, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x1000, v2
-; GFX11-TRUE16-NEXT:    v_med3_i32 v8, v8, 0, 13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v15, v0
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v15, 0x3f1, v16
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, v8, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_med3_i32 v15, v15, 0, 13
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, v11, v9
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v13, v0
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v13, 0x3f1, v14
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, v8, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v16
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 0x1000, v0
+; GFX11-TRUE16-NEXT:    v_med3_i32 v13, v13, 0, 13
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v8, v8, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v12
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v14, v17, vcc_lo
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, 0xfffffc10, v9
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x1000, v0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v9, 12, v2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v12, v8, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v15, v14
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s3, 31, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v10, v13, v17 :: v_dual_and_b32 v13, 7, v8
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, v13, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v9
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v10, 12, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v13, v13, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v8, v15, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, 0xfffffc10, v5
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v15, v15, v12
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v12
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v9, v16, v9
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, v11, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v13, v17
 ; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v11, v5, 12, v4
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v17, 0x7e00
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v15, v14
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v12, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v16, 12, v0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v16
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v12, 7, v10
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v12
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v12, 0x7e00
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v14, 7, v11
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v12, v13
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v12, v14, 12, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v15, v13
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v12, v11 :: v_dual_and_b32 v12, 7, v9
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v8, v8, v13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v11
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v12
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 3, v14
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s1, 5, v14
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v8, 0x7c00, v8, s3
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s1, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, 0, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v13, v15, v13
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, v11, v13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0x7c00, v12, s0
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v9
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v12, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v17 :: v_dual_add_nc_u32 v9, v9, v12
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.h, 0
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, v10, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v8, v2
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v12, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v17, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -5138,7 +5140,7 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v1, 0x7c00, v9, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v17, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v6
@@ -5151,114 +5153,123 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v5
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v5, v5, 20, 11
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v8, v5, 20, 11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v5
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v10, v1, 20, 11
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v9, 0x3f1, v8
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v3
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v15, 0x3f1, v10
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v3, 20, 11
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v9, v9, 0, 13
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v3, 20, 11
-; GFX11-FAKE16-NEXT:    v_med3_i32 v15, v15, 0, 13
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v8, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v5, v4
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, 0x1000, v4
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v8, v0
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v5, v0
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v5, 0x3f1, v10
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, v9, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v9, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, 0x1000, v0
+; GFX11-FAKE16-NEXT:    v_med3_i32 v5, v5, 0, 13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v12, v2
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v14
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v8, v9, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, 1, v13
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v13
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v9, v9, v15
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0xfffffc10, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, v5, v14
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x1000, v2
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v12, v12, 0, 13
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v8, v11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v15, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v13, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, 0xfffffc10, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, v12, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v9, v5, 12, v4
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v5
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 1, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, v15, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, 1, v11
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v9, v16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v9, v11, v15 :: v_dual_add_nc_u32 v10, 0xfffffc10, v10
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v9, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v5, v5, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, v12, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v14
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v9, v15, v9
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, 0xfffffc10, v8
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v17
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v11, v10, 12, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v13, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v14, v8, 12, v4
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v5, v16, v5
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v8
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v15, v10, 12, v0
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v14, 12, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v8
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 2, v8
-; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v12, v13, v12 :: v_dual_and_b32 v13, 7, v9
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v11
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v12, v13, 12, v2
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v13
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v14, 7, v9
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 2, v9
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v12
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v13
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v13
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v11
-; GFX11-FAKE16-NEXT:    s_or_b32 s1, s2, s1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v8, v8, v13
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v12, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, v14, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 7, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 7, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v14, v15, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, v5, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, v12, v15
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, v9, v11
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v11, 0x7e00
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v9, 0x7c00, v9, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v16, v18
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, 0x7c00, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v12 :: v_dual_add_nc_u32 v11, v11, v15
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v8
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v9, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v8
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v8, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v7
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %mag.trunc = fptrunc <3 x double> %mag to <3 x half>
@@ -5797,27 +5808,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_and_b32_e32 v12, 0xffe, v12
 ; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v13, v7, 20, 11
-; SI-NEXT:    s_movk_i32 s6, 0x3f1
+; SI-NEXT:    s_movk_i32 s4, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v6, v12, v6
-; SI-NEXT:    v_sub_i32_e32 v14, vcc, s6, v13
+; SI-NEXT:    v_sub_i32_e32 v14, vcc, s4, v13
 ; SI-NEXT:    v_or_b32_e32 v12, 0x1000, v6
 ; SI-NEXT:    v_med3_i32 v14, v14, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v15, v14, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, v14, v15
-; SI-NEXT:    v_or_b32_e32 v16, 1, v15
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v14, v12
-; SI-NEXT:    s_movk_i32 s7, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v12, v15, v16, vcc
-; SI-NEXT:    v_add_i32_e32 v13, vcc, s7, v13
+; SI-NEXT:    s_movk_i32 s5, 0xfc10
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v13, vcc, s5, v13
 ; SI-NEXT:    v_lshlrev_b32_e32 v14, 12, v13
+; SI-NEXT:    v_or_b32_e32 v12, v15, v12
 ; SI-NEXT:    v_or_b32_e32 v14, v6, v14
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT:    v_and_b32_e32 v14, 7, v12
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v14
+; SI-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v14
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v14
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v14, v14, v15
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
 ; SI-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; SI-NEXT:    v_mov_b32_e32 v14, 0x7c00
@@ -5825,9 +5837,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e32 v12, v14, v12, vcc
 ; SI-NEXT:    v_mov_b32_e32 v15, 0x7e00
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; SI-NEXT:    s_movk_i32 s8, 0x40f
+; SI-NEXT:    s_movk_i32 s6, 0x40f
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v13
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; SI-NEXT:    v_and_b32_e32 v12, 0x1ff, v5
@@ -5840,31 +5852,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v12, v5, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v4, v7, v4
-; SI-NEXT:    v_sub_i32_e32 v13, vcc, s6, v12
+; SI-NEXT:    v_sub_i32_e32 v13, vcc, s4, v12
 ; SI-NEXT:    v_or_b32_e32 v7, 0x1000, v4
 ; SI-NEXT:    v_med3_i32 v13, v13, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v16, v13, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, v13, v16
-; SI-NEXT:    v_or_b32_e32 v17, 1, v16
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v13, v7
-; SI-NEXT:    v_cndmask_b32_e32 v7, v16, v17, vcc
-; SI-NEXT:    v_add_i32_e32 v12, vcc, s7, v12
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v12, vcc, s5, v12
 ; SI-NEXT:    v_lshlrev_b32_e32 v13, 12, v12
+; SI-NEXT:    v_or_b32_e32 v7, v16, v7
 ; SI-NEXT:    v_or_b32_e32 v13, v4, v13
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v13, v7, vcc
 ; SI-NEXT:    v_and_b32_e32 v13, 7, v7
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v13
+; SI-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v13
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v13
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v13, v13, v16
 ; SI-NEXT:    v_lshrrev_b32_e32 v7, 2, v7
 ; SI-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v7, v14, v7, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v12
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; SI-NEXT:    v_and_b32_e32 v7, 0x1ff, v3
@@ -5877,31 +5890,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v7, v3, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v2, v5, v2
-; SI-NEXT:    v_sub_i32_e32 v12, vcc, s6, v7
+; SI-NEXT:    v_sub_i32_e32 v12, vcc, s4, v7
 ; SI-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; SI-NEXT:    v_med3_i32 v12, v12, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v13, v12, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
-; SI-NEXT:    v_or_b32_e32 v16, 1, v13
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v5
-; SI-NEXT:    v_cndmask_b32_e32 v5, v13, v16, vcc
-; SI-NEXT:    v_add_i32_e32 v7, vcc, s7, v7
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v7, vcc, s5, v7
 ; SI-NEXT:    v_lshlrev_b32_e32 v12, 12, v7
+; SI-NEXT:    v_or_b32_e32 v5, v13, v5
 ; SI-NEXT:    v_or_b32_e32 v12, v2, v12
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v12, 7, v5
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
+; SI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v12
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v12, v12, v13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; SI-NEXT:    v_and_b32_e32 v5, 0x1ff, v1
@@ -5914,31 +5928,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_bfe_u32 v5, v1, 20, 11
 ; SI-NEXT:    v_or_b32_e32 v0, v3, v0
-; SI-NEXT:    v_sub_i32_e32 v7, vcc, s6, v5
+; SI-NEXT:    v_sub_i32_e32 v7, vcc, s4, v5
 ; SI-NEXT:    v_or_b32_e32 v3, 0x1000, v0
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v12, v7, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v12
-; SI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v3
-; SI-NEXT:    v_cndmask_b32_e32 v3, v12, v13, vcc
-; SI-NEXT:    v_add_i32_e32 v5, vcc, s7, v5
+; SI-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; SI-NEXT:    v_add_i32_e32 v5, vcc, s5, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v5
+; SI-NEXT:    v_or_b32_e32 v3, v12, v3
 ; SI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v3
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; SI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v7, v7, v12
 ; SI-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
 ; SI-NEXT:    v_cndmask_b32_e32 v3, v14, v3, vcc
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v14, v15, vcc
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; SI-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
 ; SI-NEXT:    v_and_b32_e32 v1, 0x8000, v1
@@ -5964,27 +5979,28 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_and_b32_e32 v10, 0xffe, v10
 ; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v5, v5, 20, 11
-; VI-NEXT:    s_movk_i32 s6, 0x3f1
+; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v4, v10, v4
-; VI-NEXT:    v_sub_u32_e32 v11, vcc, s6, v5
+; VI-NEXT:    v_sub_u32_e32 v11, vcc, s4, v5
 ; VI-NEXT:    v_or_b32_e32 v10, 0x1000, v4
 ; VI-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v12, v11, v10
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, v11, v12
-; VI-NEXT:    v_or_b32_e32 v13, 1, v12
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v10
-; VI-NEXT:    s_movk_i32 s7, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v10, v12, v13, vcc
-; VI-NEXT:    v_add_u32_e32 v5, vcc, s7, v5
+; VI-NEXT:    s_movk_i32 s5, 0xfc10
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s5, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v11, 12, v5
+; VI-NEXT:    v_or_b32_e32 v10, v12, v10
 ; VI-NEXT:    v_or_b32_e32 v11, v4, v11
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT:    v_and_b32_e32 v11, 7, v10
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
+; VI-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v11, v11, v12
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, v10, v11
 ; VI-NEXT:    v_mov_b32_e32 v11, 0x7c00
@@ -5992,9 +6008,9 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
 ; VI-NEXT:    v_mov_b32_e32 v12, 0x7e00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; VI-NEXT:    s_movk_i32 s8, 0x40f
+; VI-NEXT:    s_movk_i32 s6, 0x40f
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v10, 0x1ff, v7
 ; VI-NEXT:    v_or_b32_e32 v6, v10, v6
@@ -6004,31 +6020,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v7, v7, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v5, v5, v6
-; VI-NEXT:    v_sub_u32_e32 v10, vcc, s6, v7
+; VI-NEXT:    v_sub_u32_e32 v10, vcc, s4, v7
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v5
 ; VI-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v13, v10, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, v10, v13
-; VI-NEXT:    v_or_b32_e32 v14, 1, v13
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
-; VI-NEXT:    v_cndmask_b32_e32 v6, v13, v14, vcc
-; VI-NEXT:    v_add_u32_e32 v7, vcc, s7, v7
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v7, vcc, s5, v7
 ; VI-NEXT:    v_lshlrev_b32_e32 v10, 12, v7
+; VI-NEXT:    v_or_b32_e32 v6, v13, v6
 ; VI-NEXT:    v_or_b32_e32 v10, v5, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; VI-NEXT:    v_and_b32_e32 v10, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
+; VI-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v10, v10, v13
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v10
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; VI-NEXT:    v_and_b32_e32 v7, 0x1ff, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v7, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
@@ -6038,31 +6055,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v1, v1, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v0, v6, v0
-; VI-NEXT:    v_sub_u32_e32 v7, vcc, s6, v1
+; VI-NEXT:    v_sub_u32_e32 v7, vcc, s4, v1
 ; VI-NEXT:    v_or_b32_e32 v6, 0x1000, v0
 ; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v10, v7, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
-; VI-NEXT:    v_or_b32_e32 v13, 1, v10
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v6
-; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v13, vcc
-; VI-NEXT:    v_add_u32_e32 v1, vcc, s7, v1
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s5, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v1
+; VI-NEXT:    v_or_b32_e32 v6, v10, v6
 ; VI-NEXT:    v_or_b32_e32 v7, v0, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
 ; VI-NEXT:    v_and_b32_e32 v7, 7, v6
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; VI-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v7, v7, v10
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 0x1ff, v3
 ; VI-NEXT:    v_or_b32_e32 v2, v6, v2
@@ -6072,31 +6090,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_bfe_u32 v3, v3, 20, 11
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v2
-; VI-NEXT:    v_sub_u32_e32 v6, vcc, s6, v3
+; VI-NEXT:    v_sub_u32_e32 v6, vcc, s4, v3
 ; VI-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
-; VI-NEXT:    v_or_b32_e32 v10, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v2
-; VI-NEXT:    v_cndmask_b32_e32 v2, v7, v10, vcc
-; VI-NEXT:    v_add_u32_e32 v3, vcc, s7, v3
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_add_u32_e32 v3, vcc, s5, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v3
+; VI-NEXT:    v_or_b32_e32 v2, v7, v2
 ; VI-NEXT:    v_or_b32_e32 v6, v1, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v2
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v11, v12, vcc
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v3
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
@@ -6112,31 +6131,32 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s6, 0x1ff
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s6, v4
+; GFX9-NEXT:    s_movk_i32 s4, 0x1ff
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s4, v4
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v5
-; GFX9-NEXT:    s_movk_i32 s7, 0xffe
+; GFX9-NEXT:    s_movk_i32 s5, 0xffe
 ; GFX9-NEXT:    v_bfe_u32 v11, v5, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v4, v10, s7, v4
+; GFX9-NEXT:    v_and_or_b32 v4, v10, s5, v4
 ; GFX9-NEXT:    v_sub_u32_e32 v12, 0x3f1, v11
 ; GFX9-NEXT:    v_or_b32_e32 v10, 0x1000, v4
 ; GFX9-NEXT:    v_med3_i32 v12, v12, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v13, v12, v10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v12, v12, v13
-; GFX9-NEXT:    v_or_b32_e32 v14, 1, v13
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v12, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v11, 0xfffffc10, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v13, v14, vcc
+; GFX9-NEXT:    v_or_b32_e32 v10, v13, v10
 ; GFX9-NEXT:    v_lshl_or_b32 v12, v11, 12, v4
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v12, 7, v10
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v12
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v12
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v12, v12, v13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 2, v10
 ; GFX9-NEXT:    v_add_u32_e32 v10, v10, v12
 ; GFX9-NEXT:    v_mov_b32_e32 v12, 0x7c00
@@ -6144,112 +6164,115 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v10, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v13, 0x7e00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GFX9-NEXT:    s_movk_i32 s8, 0x40f
+; GFX9-NEXT:    s_movk_i32 s6, 0x40f
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v11
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v11
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    s_mov_b32 s9, 0x8000
-; GFX9-NEXT:    v_and_or_b32 v4, v5, s9, v4
-; GFX9-NEXT:    v_and_or_b32 v5, v7, s6, v6
+; GFX9-NEXT:    s_mov_b32 s7, 0x8000
+; GFX9-NEXT:    v_and_or_b32 v4, v5, s7, v4
+; GFX9-NEXT:    v_and_or_b32 v5, v7, s4, v6
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v7
 ; GFX9-NEXT:    v_bfe_u32 v10, v7, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v5, v6, s7, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s5, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v11, 0x3f1, v10
 ; GFX9-NEXT:    v_or_b32_e32 v6, 0x1000, v5
 ; GFX9-NEXT:    v_med3_i32 v11, v11, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v14, v11, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; GFX9-NEXT:    v_or_b32_e32 v15, 1, v14
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v11, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v10, 0xfffffc10, v10
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v14, v15, vcc
+; GFX9-NEXT:    v_or_b32_e32 v6, v14, v6
 ; GFX9-NEXT:    v_lshl_or_b32 v11, v10, 12, v5
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v11, 7, v6
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v11
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v11
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v11, v11, v14
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v11
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v10
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s6, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v10
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s4, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_and_or_b32 v5, v6, s9, v5
+; GFX9-NEXT:    v_and_or_b32 v5, v6, s7, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX9-NEXT:    v_bfe_u32 v7, v1, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v0, v6, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v0, v6, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v10, 0x3f1, v7
 ; GFX9-NEXT:    v_or_b32_e32 v6, 0x1000, v0
 ; GFX9-NEXT:    v_med3_i32 v10, v10, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v11, v10, v6
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v10, v10, v11
-; GFX9-NEXT:    v_or_b32_e32 v14, 1, v11
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v10, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v7, 0xfffffc10, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v11, v14, vcc
+; GFX9-NEXT:    v_or_b32_e32 v6, v11, v6
 ; GFX9-NEXT:    v_lshl_or_b32 v10, v7, 12, v0
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v10, 7, v6
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v10
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v10
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v10, v10, v11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v6
 ; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_and_or_b32 v0, v1, s9, v0
-; GFX9-NEXT:    v_and_or_b32 v1, v3, s6, v2
+; GFX9-NEXT:    v_and_or_b32 v0, v1, s7, v0
+; GFX9-NEXT:    v_and_or_b32 v1, v3, s4, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX9-NEXT:    v_bfe_u32 v6, v3, 20, 11
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
 ; GFX9-NEXT:    v_sub_u32_e32 v7, 0x3f1, v6
 ; GFX9-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX9-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v10, v7, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v7, v7, v10
-; GFX9-NEXT:    v_or_b32_e32 v11, 1, v10
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v6, 0xfffffc10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v11, vcc
+; GFX9-NEXT:    v_or_b32_e32 v2, v10, v2
 ; GFX9-NEXT:    v_lshl_or_b32 v7, v6, 12, v1
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v7, 7, v2
+; GFX9-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; GFX9-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; GFX9-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_or_b32_e32 v7, v7, v10
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v7
 ; GFX9-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v12, v2, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v12, v13, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s6, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s9, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, s7, v1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_mov_b32 s5, 0x7fff7fff
@@ -6265,149 +6288,158 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-TRUE16-NEXT:    v_bfe_u32 v10, v7, 20, 11
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 8, v7
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x1ff, v5, v4
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 8, v5
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v16, v5, 20, 11
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v5
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v5, 20, 11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v19, v3, 20, 11
+; GFX11-TRUE16-NEXT:    v_bfe_u32 v20, v1, 20, 11
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v7.h
 ; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0xffe, v11, v6
 ; GFX11-TRUE16-NEXT:    v_med3_i32 v11, v12, 0, 13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.h, 0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, 0x1000, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v15, v4
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, v11, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x1000, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v17, 1, v14
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v12
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v1
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v14, v11
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v16
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, v14, v17 :: v_dual_add_nc_u32 v16, 0xfffffc10, v16
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v18, v10, 12, v6
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v14, 0x3f1, v17
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v21, v10, 12, v6
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-TRUE16-NEXT:    v_med3_i32 v12, v12, 0, 13
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 8, v3
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v17, v3, 20, 11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v18, v11, vcc_lo
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, 0x7e00
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, v12, v15
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0xffe, v16, v4
+; GFX11-TRUE16-NEXT:    v_med3_i32 v14, v14, 0, 13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v21, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v11
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v19
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v13, 0x7e00
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x1000, v4
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v21, 0x3f1, v19
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 7, v11
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v20, 1, v19
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v18
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v18, 0x3f1, v17
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v14, v2
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v14, v16, 12, v4
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0xfffffc10, v17
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v21, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v15, 0x1000, v2
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v17
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v19, v20, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-TRUE16-NEXT:    v_med3_i32 v21, v21, 0, 13
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0xffe, v18, v2
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v18, v14, v22
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v12, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v23, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v7.h
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.h, 0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v16
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v12, v14, v12 :: v_dual_add_nc_u32 v11, v11, v21
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, v16, v23
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v6, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v16
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v12, v14, v18
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v14, 0x1000, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v22
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v22, v21, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-TRUE16-NEXT:    v_med3_i32 v14, v18, 0, 13
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_and_b32 v18, 7, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v18, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v19, v14, v15
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v17, 0xfffffc10, v17
+; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v20
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_bfe_u32 v11, v1, 20, 11
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v14, v14, v19
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v21, 1, v19
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e64 s0, v14, v15
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v15, v17, 12, v2
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0xffe, v10, v0
-; GFX11-TRUE16-NEXT:    v_sub_nc_u32_e32 v10, 0x3f1, v11
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, v19, v21, s0
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v18
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v11, 0xfffffc10, v11
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v22, 0x1000, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v16, v17, 12, v4
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v17
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v11, v21, v22
 ; GFX11-TRUE16-NEXT:    v_med3_i32 v10, v10, 0, 13
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v14, v15, v14, s1
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 1, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, v10, v22
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 7, v14
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 2, v14
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0x8000, v20, v6
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.h
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v6, 0x8000, v15, v6
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v12, v16, v12, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v14
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v19, v14, 12, v2
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, v10, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v11, v22, v11
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v12
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v10, v10, v7
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, 1, v7
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v22
-; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v11, 12, v0
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v7, v7, v18, vcc_lo
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v19, 7, v11
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v10, v16
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v18, v18, v21
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v19
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v16, 0xfffffc10, v20
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s1
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v10, v12, v15
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v15, 7, v7
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 0x40f, v16
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_lshl_or_b32 v10, v16, 12, v0
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v16
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v7, v10, v7 :: v_dual_add_nc_u32 v10, v12, v18
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v17
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v12, v19, v20
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v18, 7, v7
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 2, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v15
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s1
-; GFX11-TRUE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e64 s1, 31, v17
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x8000, v20, v4
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v13 :: v_dual_add_nc_u32 v7, v7, v10
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v12, v14, v12
+; GFX11-TRUE16-NEXT:    v_dual_cndmask_b32 v4, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v12
+; GFX11-TRUE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v18
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v4, v10, v4, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v10, v18, v12
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v5, 0x7c00, v11, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v7, v7, v10
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v4, 0x8000, v15, v4
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v3.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v5, 0x7c00, v12, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v11
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v20, v2
+; GFX11-TRUE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v16
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v2, 0x8000, v15, v2
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v7, vcc_lo
 ; GFX11-TRUE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v20.l, v1.h
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v15.l, v1.h
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v4, v9
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v11
+; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v20, v0
+; GFX11-TRUE16-NEXT:    v_and_or_b32 v0, 0x8000, v15, v0
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v8
@@ -6423,144 +6455,150 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 8, v7
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v10
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v14, v7, 20, 11
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v15, v7, 20, 11
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x1ff, v3, v2
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v14
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 8, v3
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v20, v3, 20, 11
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 8, v1
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0xffe, v11, v4
 ; GFX11-FAKE16-NEXT:    v_med3_i32 v11, v12, 0, 13
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x1000, v4
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v6, 0xffe, v13, v6
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v14
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v11, v12
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s3, 0, v6
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 1, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v12, 0x1000, v4
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v6, 0xffe, v13, v6
+; GFX11-FAKE16-NEXT:    v_med3_i32 v13, v17, 0, 13
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v14, v11, v12
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, 0x1000, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v11, v11, v14
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v11, v12
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v12, v1, 20, 11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v11
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, 0xfffffc10, v10
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 8, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v15, v16, vcc_lo
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v13, v10, 12, v4
-; GFX11-FAKE16-NEXT:    v_med3_i32 v15, v17, 0, 13
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v6
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v14, 0xfffffc10, v15
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, v13, v17
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v10, 12, v4
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v10
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, v15, v16
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v13, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_bfe_u32 v13, v1, 20, 11
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v15, v15, v17
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 7, v11
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 1, v17
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e64 s1, v15, v16
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v19
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v19
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v19, v14, 12, v6
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, v17, v21, s1
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v20
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v20, 0xfffffc10, v20
-; GFX11-FAKE16-NEXT:    v_med3_i32 v17, v17, 0, 13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v15, v19, v15, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v13, v13, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v18, v11, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v19, 0x7e00
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v16
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v18, 0x3f1, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v19, 7, v11
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v12, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 2, v11
+; GFX11-FAKE16-NEXT:    v_med3_i32 v18, v18, 0, 13
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0xffe, v16, v0
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v12, 0x3f1, v13
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v13, v17
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v16, v14, 12, v6
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v19
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v14
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v19, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v0
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 8, v3
+; GFX11-FAKE16-NEXT:    v_bfe_u32 v19, v3, 20, 11
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v11, v15
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v10
-; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v18, v2
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v18, 0x1000, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_med3_i32 v12, v12, 0, 13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, 0xfffffc10, v13
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v21, v12, v18
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v12, v12, v21
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v16, 7, v15
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v24, 1, v21
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 2, v15
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v16
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v16
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, 0x1000, v2
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, v17, v16
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v12, v18
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v18, v13, 12, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_lshlrev_b32_e32 v17, v17, v23
-; GFX11-FAKE16-NEXT:    v_or_b32_e32 v25, 1, v23
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v21, v24, vcc_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v17, v16
-; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v17, v20, 12, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v23, v25, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v12, v18, v12, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v20, v18, v16
+; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0xffe, v17, v2
+; GFX11-FAKE16-NEXT:    v_sub_nc_u32_e32 v17, 0x3f1, v19
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v19, 0xfffffc10, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_lshlrev_b32 v18, v18, v20
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v15, 7, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v21, 0x1000, v2
+; GFX11-FAKE16-NEXT:    v_med3_i32 v17, v17, 0, 13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 2, v13
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v23, v17, v21
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v18, v16
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v18, 0x7e00 :: v_dual_lshlrev_b32 v17, v17, v23
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v15, v15, v22
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v17, v21
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, v13, v15
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v16, v20, v16
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v17, v23, v17
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v12, 0xfffffc10, v12
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v12, 12, v0
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v12
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v20, v16, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc_lo
+; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v20, v19, 12, v2
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v4, 0x7c00, v18 :: v_dual_and_b32 v15, 7, v16
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v17, v20, v17, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v15, v15, v22
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v17, 7, v12
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v12, 2, v12
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v10, 7, v16
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v14
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s0, 5, v17
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v6, 0x7c00, v19, s3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
-; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e64 s2, 5, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v15, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v17
-; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 2, v16
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 7, v17
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v4, 0x8000, v5, v4
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v13, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v15
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v11
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v13, v15, v13
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v11
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 2, v16
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 2, v17
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v13, v15, v13
+; GFX11-FAKE16-NEXT:    v_or_b32_e32 v11, v11, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, 0x7c00, v18, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s2, s1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v10, v12, v10
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v11, v6, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v13
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v12
+; GFX11-FAKE16-NEXT:    v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v11, v16, v15
-; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v20
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v19
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v19, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v20
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v12
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v0, v11, v0, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v19
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v1, 0x8000, v3, v2
 ; GFX11-FAKE16-NEXT:    v_and_or_b32 v2, 0x8000, v7, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index af0c38c5624ba..462d7748b86cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3944,10 +3944,9 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
-; SI-NEXT:    v_and_b32_e32 v3, 0x7ff, v3
+; SI-NEXT:    v_bfe_u32 v3, v1, 20, 11
 ; SI-NEXT:    s_movk_i32 s4, 0x3f1
 ; SI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SI-NEXT:    v_sub_i32_e32 v4, vcc, s4, v3
@@ -3955,20 +3954,21 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -3994,10 +3994,9 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 20, v1
 ; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; VI-NEXT:    v_and_b32_e32 v2, 0xffe, v2
-; VI-NEXT:    v_and_b32_e32 v3, 0x7ff, v3
+; VI-NEXT:    v_bfe_u32 v3, v1, 20, 11
 ; VI-NEXT:    s_movk_i32 s4, 0x3f1
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, s4, v3
@@ -4005,20 +4004,21 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4041,47 +4041,47 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 20, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x7ff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 0x3f1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 0x3f1, v2
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 0xfffffc10, v2
+; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_med3_i32 v2, v4, 0, 13
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_med3_i32 v3, v3, 0, 13
-; GFX11-NEXT:    v_lshl_or_b32 v7, v2, 12, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v3, v4
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x7e00
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v4, 7, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4
-; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
@@ -4106,20 +4106,21 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4153,20 +4154,21 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4200,32 +4202,35 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    v_lshl_or_b32 v7, v3, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
@@ -4253,20 +4258,21 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; SI-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v6, v5, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, v5, v6
-; SI-NEXT:    v_or_b32_e32 v7, 1, v6
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, s4, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v5, 12, v4
+; SI-NEXT:    v_or_b32_e32 v2, v6, v2
 ; SI-NEXT:    v_or_b32_e32 v5, v0, v5
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v5, 7, v2
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
+; SI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v5
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v5, v5, v6
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; SI-NEXT:    v_mov_b32_e32 v5, 0x7c00
@@ -4304,20 +4310,21 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; VI-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v7, v6, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
-; VI-NEXT:    v_or_b32_e32 v8, 1, v7
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v4
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, s4, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 12, v5
+; VI-NEXT:    v_or_b32_e32 v4, v7, v4
 ; VI-NEXT:    v_or_b32_e32 v6, v0, v6
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; VI-NEXT:    v_and_b32_e32 v6, 7, v4
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v6
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v6, v6, v7
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_mov_b32_e32 v6, 0x7c00
@@ -4352,28 +4359,32 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v5, 0x1000, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_or_b32_e32 v7, 1, v6
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v3, v6, v7 :: v_dual_add_nc_u32 v4, 0xfffffc10, v4
-; GFX11-NEXT:    v_lshl_or_b32 v8, v4, 12, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v3, v6, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 0xfffffc10, v4
+; GFX11-NEXT:    v_lshl_or_b32 v5, v4, 12, v2
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v6, 0x7e00 :: v_dual_cndmask_b32 v3, v8, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v5, 7, v3
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 2, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v5
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v5
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v5
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, 0x7c00, v6 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
@@ -4510,20 +4521,21 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; SI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
-; SI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v6
 ; SI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
+; SI-NEXT:    v_or_b32_e32 v5, v8, v5
 ; SI-NEXT:    v_or_b32_e32 v7, v4, v7
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; SI-NEXT:    v_and_b32_e32 v7, 7, v5
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; SI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; SI-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; SI-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -4560,21 +4572,22 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; VI-NEXT:    v_med3_i32 v7, v7, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v8, v7, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, v7, v8
-; VI-NEXT:    v_or_b32_e32 v9, 1, v8
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v7, v5
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v6, vcc, s4, v6
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 12, v6
+; VI-NEXT:    v_or_b32_e32 v5, v8, v5
 ; VI-NEXT:    v_or_b32_e32 v7, v4, v7
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v6
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; VI-NEXT:    v_and_b32_e32 v7, 7, v5
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v7
+; VI-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v7
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v7
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
+; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; VI-NEXT:    v_mul_f64 v[2:3], -v[0:1], v[2:3]
-; VI-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; VI-NEXT:    v_or_b32_e32 v7, v7, v8
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
 ; VI-NEXT:    v_add_u32_e32 v5, vcc, v5, v7
 ; VI-NEXT:    v_mov_b32_e32 v7, 0x7c00
@@ -4612,27 +4625,28 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, v5, v7
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
-; GFX11-NEXT:    v_or_b32_e32 v9, 1, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v5, v7
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, v8, v5
 ; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0xfffffc10, v6
-; GFX11-NEXT:    v_mov_b32_e32 v7, 0x7e00
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v10, v6, 12, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshl_or_b32 v7, v6, 12, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v10, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 7, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 2, v5
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 2, v5
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v0, v7
+; GFX11-NEXT:    v_dual_mov_b32 v7, 0x7e00 :: v_dual_add_nc_u32 v0, v5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7c00, v7, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v5
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v6
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
@@ -4667,20 +4681,21 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; SI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; SI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; SI-NEXT:    s_movk_i32 s4, 0xfc10
-; SI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; SI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, s4, v3
 ; SI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; SI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; SI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; SI-NEXT:    v_and_b32_e32 v4, 7, v2
+; SI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; SI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; SI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; SI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; SI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; SI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4716,20 +4731,21 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_med3_i32 v4, v4, 0, 13
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, v4, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; VI-NEXT:    v_or_b32_e32 v6, 1, v5
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, v4, v2
 ; VI-NEXT:    s_movk_i32 s4, 0xfc10
-; VI-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 12, v3
+; VI-NEXT:    v_or_b32_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v4, v0, v4
 ; VI-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; VI-NEXT:    v_and_b32_e32 v4, 7, v2
+; VI-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v4
+; VI-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v4
-; VI-NEXT:    v_cmp_lt_i32_e64 s[4:5], 5, v4
-; VI-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; VI-NEXT:    v_or_b32_e32 v4, v4, v5
 ; VI-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x7c00
@@ -4764,34 +4780,36 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX11-NEXT:    v_or_b32_e32 v6, 1, v5
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v6 :: v_dual_add_nc_u32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    v_lshl_or_b32 v7, v3, 12, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_cndmask_b32 v2, v7, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT:    v_cmp_lt_i32_e64 s0, 5, v4
-; GFX11-NEXT:    s_or_b32 s0, s0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, v2, v4
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_and_or_b32 v1, 0x8000, v1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index c20b99444ab35..20809e6b5afcc 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1002,11 +1002,12 @@ define float @cospiD_pattern0_half(i16 %arg, float %arg1, float %arg2) {
 ; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v3
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v0
 ; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
-; GFX7-NEXT:    v_cmp_lt_i32_e32 vcc, 1, v3
-; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX7-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX7-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX7-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index d99cf35c482a4..49c563eef5d82 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -111,36 +111,34 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; SI-NEXT:    s_bfe_u32 s0, s7, 0xb0014
 ; SI-NEXT:    v_readfirstlane_b32 s1, v0
 ; SI-NEXT:    s_sub_i32 s6, 0x3f1, s0
-; SI-NEXT:    s_or_b32 s10, s8, s1
+; SI-NEXT:    s_or_b32 s1, s8, s1
 ; SI-NEXT:    v_med3_i32 v0, s6, 0, 13
-; SI-NEXT:    s_or_b32 s1, s10, 0x1000
-; SI-NEXT:    v_readfirstlane_b32 s6, v0
-; SI-NEXT:    s_lshr_b32 s8, s1, s6
-; SI-NEXT:    s_or_b32 s9, s8, 1
-; SI-NEXT:    s_lshl_b32 s6, s8, s6
-; SI-NEXT:    s_cmp_lg_u32 s6, s1
-; SI-NEXT:    s_cselect_b32 s1, s9, s8
-; SI-NEXT:    s_add_i32 s6, s0, 0xfffffc10
-; SI-NEXT:    s_lshl_b32 s0, s6, 12
-; SI-NEXT:    s_or_b32 s0, s10, s0
-; SI-NEXT:    s_cmp_lt_i32 s6, 1
-; SI-NEXT:    s_cselect_b32 s11, s1, s0
-; SI-NEXT:    s_and_b32 s8, s11, 7
-; SI-NEXT:    s_cmp_eq_u32 s8, 3
-; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_or_b32 s6, s1, 0x1000
+; SI-NEXT:    v_readfirstlane_b32 s8, v0
+; SI-NEXT:    s_lshr_b32 s9, s6, s8
+; SI-NEXT:    s_lshl_b32 s8, s9, s8
+; SI-NEXT:    s_cmp_lg_u32 s8, s6
+; SI-NEXT:    s_cselect_b32 s6, 1, 0
+; SI-NEXT:    s_addk_i32 s0, 0xfc10
+; SI-NEXT:    s_or_b32 s6, s9, s6
+; SI-NEXT:    s_lshl_b32 s8, s0, 12
+; SI-NEXT:    s_or_b32 s8, s1, s8
+; SI-NEXT:    s_cmp_lt_i32 s0, 1
+; SI-NEXT:    s_cselect_b32 s6, s6, s8
+; SI-NEXT:    s_and_b32 s8, s6, 7
 ; SI-NEXT:    s_cmp_gt_i32 s8, 5
-; SI-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], exec
-; SI-NEXT:    s_cselect_b32 s0, 1, 0
-; SI-NEXT:    s_lshr_b32 s1, s11, 2
-; SI-NEXT:    s_add_i32 s1, s1, s0
-; SI-NEXT:    s_cmp_lt_i32 s6, 31
-; SI-NEXT:    s_cselect_b32 s0, s1, 0x7c00
-; SI-NEXT:    s_cmp_lg_u32 s10, 0
+; SI-NEXT:    s_cselect_b32 s9, 1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 3
+; SI-NEXT:    s_cselect_b32 s8, 1, 0
+; SI-NEXT:    s_lshr_b32 s6, s6, 2
+; SI-NEXT:    s_or_b32 s8, s8, s9
+; SI-NEXT:    s_add_i32 s6, s6, s8
+; SI-NEXT:    s_cmp_lt_i32 s0, 31
+; SI-NEXT:    s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT:    s_cmp_lg_u32 s1, 0
 ; SI-NEXT:    s_cselect_b32 s1, s2, 0x7c00
-; SI-NEXT:    s_cmpk_eq_i32 s6, 0x40f
-; SI-NEXT:    s_cselect_b32 s0, s1, s0
+; SI-NEXT:    s_cmpk_eq_i32 s0, 0x40f
+; SI-NEXT:    s_cselect_b32 s0, s1, s6
 ; SI-NEXT:    s_lshr_b32 s1, s7, 16
 ; SI-NEXT:    s_and_b32 s1, s1, 0x8000
 ; SI-NEXT:    s_or_b32 s6, s1, s0
@@ -167,39 +165,37 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
 ; VI-SAFE-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s5, s7, 0xb0014
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s6, s8, s4
-; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s5
+; VI-SAFE-SDAG-NEXT:    s_bfe_u32 s6, s7, 0xb0014
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s8, s4
+; VI-SAFE-SDAG-NEXT:    s_sub_i32 s8, 0x3f1, s6
 ; VI-SAFE-SDAG-NEXT:    v_med3_i32 v0, s8, 0, 13
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s6, 0x1000
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
 ; VI-SAFE-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s9, s4, s8
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s10, s9, 1
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s9, s5, s8
 ; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s8, s9, s8
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s8, s4
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s10, s9
-; VI-SAFE-SDAG-NEXT:    s_add_i32 s10, s5, 0xfffffc10
-; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s5, s10, 12
-; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s6, s5
-; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 1
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s11, s4, s5
-; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s11, 7
-; VI-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s8, 3
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s8, s5
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_addk_i32 s6, 0xfc10
+; VI-SAFE-SDAG-NEXT:    s_lshl_b32 s8, s6, 12
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s5, s9, s5
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s8, s4, s8
+; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s6, 1
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT:    s_and_b32 s8, s5, 7
 ; VI-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s8, 5
-; VI-SAFE-SDAG-NEXT:    s_cselect_b64 s[8:9], -1, 0
-; VI-SAFE-SDAG-NEXT:    s_or_b64 s[4:5], s[8:9], s[4:5]
-; VI-SAFE-SDAG-NEXT:    s_and_b64 s[4:5], s[4:5], exec
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, 1, 0
-; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s11, 2
-; VI-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s4
-; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s10, 31
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s5, 0x7c00
-; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, 0
-; VI-SAFE-SDAG-NEXT:    s_movk_i32 s5, 0x7e00
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s9, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s8, 3
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s8, 1, 0
+; VI-SAFE-SDAG-NEXT:    s_or_b32 s8, s8, s9
+; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; VI-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s8
+; VI-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s6, 31
 ; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, 0x7c00
-; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s10, 0x40f
-; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s5, s4
+; VI-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s4, 0
+; VI-SAFE-SDAG-NEXT:    s_movk_i32 s4, 0x7e00
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT:    s_cmpk_eq_i32 s6, 0x40f
+; VI-SAFE-SDAG-NEXT:    s_cselect_b32 s4, s4, s5
 ; VI-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s7, 16
 ; VI-SAFE-SDAG-NEXT:    s_and_b32 s5, s5, 0x8000
 ; VI-SAFE-SDAG-NEXT:    s_or_b32 s4, s5, s4
@@ -300,23 +296,21 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s4, 0x1000
 ; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s8, s7, 1
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s8, s7
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s5, s7, s5
 ; GFX10-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s2, 12
 ; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s4, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
-; GFX10-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
-; GFX10-SAFE-SDAG-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX10-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX10-SAFE-SDAG-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX10-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX10-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s6
 ; GFX10-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
 ; GFX10-SAFE-SDAG-NEXT:    s_movk_i32 s6, 0x7e00
@@ -431,26 +425,23 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s7, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s7, s6
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s8, s7, 1
+; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lg_u32 s6, s5
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s8, s7
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, 1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s5, s7, s5
 ; GFX11-SAFE-SDAG-NEXT:    s_lshl_b32 s6, s2, 12
 ; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s4, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 1
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s5, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
-; GFX11-SAFE-SDAG-NEXT:    s_and_b32 s6, s6, exec_lo
+; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-SDAG-NEXT:    s_cmp_eq_u32 s6, 3
 ; GFX11-SAFE-SDAG-NEXT:    s_cselect_b32 s6, 1, 0
 ; GFX11-SAFE-SDAG-NEXT:    s_lshr_b32 s5, s5, 2
+; GFX11-SAFE-SDAG-NEXT:    s_or_b32 s6, s6, s7
 ; GFX11-SAFE-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SAFE-SDAG-NEXT:    s_add_i32 s5, s5, s6
 ; GFX11-SAFE-SDAG-NEXT:    s_cmp_lt_i32 s2, 31
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
index 27e5b521ae8c3..d8f21d285ddff 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.math.ll
@@ -284,85 +284,91 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) {
 ; GFX950-SDAG-LABEL: v_test_cvt_v2f64_v2f16:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x1ff
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s2, v0
+; GFX950-SDAG-NEXT:    s_movk_i32 s0, 0x1ff
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s0, v0
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX950-SDAG-NEXT:    s_movk_i32 s3, 0xffe
+; GFX950-SDAG-NEXT:    s_movk_i32 s1, 0xffe
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GFX950-SDAG-NEXT:    v_bfe_u32 v5, v1, 20, 11
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v4, s3, v0
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v4, s1, v0
 ; GFX950-SDAG-NEXT:    v_sub_u32_e32 v6, 0x3f1, v5
 ; GFX950-SDAG-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX950-SDAG-NEXT:    v_med3_i32 v6, v6, 0, 13
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v7, v6, v4
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v6, v6, v7
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v8, 1, v7
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v6, v4
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v5, 0xfffffc10, v5
 ; GFX950-SDAG-NEXT:    v_lshl_or_b32 v6, v5, 12, v0
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v4, v7, v4
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v5
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7e00
-; GFX950-SDAG-NEXT:    s_movk_i32 s4, 0x40f
+; GFX950-SDAG-NEXT:    s_movk_i32 s2, 0x40f
+; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v6, 7, v4
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
-; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], 5, v6
-; GFX950-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v6
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 2, v4
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
+; GFX950-SDAG-NEXT:    s_mov_b32 s3, 0x8000
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v6
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v6, v6, v7
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v4, v4, v6
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7c00
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v5
-; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX950-SDAG-NEXT:    s_mov_b32 s5, 0x8000
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, 0x7e00
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v6, v7, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v5
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v5
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s5, v0
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v3, s2, v2
+; GFX950-SDAG-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v3, s0, v2
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 8, v3
 ; GFX950-SDAG-NEXT:    v_bfe_u32 v4, v3, 20, 11
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s3, v1
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s1, v1
 ; GFX950-SDAG-NEXT:    v_sub_u32_e32 v5, 0x3f1, v4
 ; GFX950-SDAG-NEXT:    v_or_b32_e32 v2, 0x1000, v1
 ; GFX950-SDAG-NEXT:    v_med3_i32 v5, v5, 0, 13
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v8, v5, v2
 ; GFX950-SDAG-NEXT:    v_lshlrev_b32_e32 v5, v5, v8
-; GFX950-SDAG-NEXT:    v_or_b32_e32 v9, 1, v8
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, v5, v2
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v4, 0xfffffc10, v4
 ; GFX950-SDAG-NEXT:    v_lshl_or_b32 v5, v4, 12, v1
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v2, v8, v2
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 1, v4
-; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX950-SDAG-NEXT:    s_nop 0
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX950-SDAG-NEXT:    v_and_b32_e32 v5, 7, v2
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
-; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], 5, v5
-; GFX950-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX950-SDAG-NEXT:    v_cmp_lt_i32_e32 vcc, 5, v5
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v5
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX950-SDAG-NEXT:    v_or_b32_e32 v5, v5, v8
 ; GFX950-SDAG-NEXT:    v_add_u32_e32 v2, v2, v5
 ; GFX950-SDAG-NEXT:    v_cmp_gt_i32_e32 vcc, 31, v4
-; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX950-SDAG-NEXT:    s_nop 0
+; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
 ; GFX950-SDAG-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
-; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v4
+; GFX950-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v4
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; GFX950-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX950-SDAG-NEXT:    v_and_or_b32 v1, v2, s3, v1
 ; GFX950-SDAG-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;

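Across the hunks above the autogenerated checks all change in the same way: the old sequence precomputed an or'd copy of the shifted mantissa (v_or_b32 v6, 1, v5 / s_or_b32 s8, s7, 1) and then selected between the two full values, while the new sequence materializes the compare result as 0 or 1 and ORs it in afterwards; the two rounding predicates (== 3 and > 5) are likewise combined with 0/1 selects and a plain 32-bit OR instead of -1/0 masks tested against exec. A condensed before/after sketch of the scalar form, with register names taken from the GFX10-SAFE-SDAG hunk above (illustrative only, not FileCheck output):

    ; before                                 ; after
    s_or_b32      s8, s7, 1                  s_cmp_lg_u32  s6, s5
    s_cmp_lg_u32  s6, s5                     s_cselect_b32 s5, 1, 0
    s_cselect_b32 s5, s8, s7                 s_or_b32      s5, s7, s5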
>From ce7cd980bb09e2141f526f6e2eff72ea0e36da31 Mon Sep 17 00:00:00 2001
From: Chris Jackson <chris.jackson at amd.com>
Date: Mon, 14 Jul 2025 09:47:48 -0500
Subject: [PATCH 28/28] Fix broken insert-delay-alu-bug.ll test

---
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    | 462 ++++++------------
 1 file changed, 154 insertions(+), 308 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index a841f7ffa02b9..9389f1614721f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -51,314 +51,160 @@ bb:
 
 ; FIXME: This generates "instid1(/* invalid instid value */)".
 define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
-
-; GFX11-TRUE16-LABEL: f2:
-; GFX11-TRUE16:       ; %bb.0: ; %bb
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-TRUE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
-; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s13
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-TRUE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX11-TRUE16-NEXT:    s_mov_b32 s20, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_lo_u32 v0, s19, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_13
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %bb14
-; GFX11-TRUE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
-; GFX11-TRUE16-NEXT:    s_mov_b32 s18, 0
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s24, -1, 0
-; GFX11-TRUE16-NEXT:    s_bitcmp0_b32 s21, 0
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_3
-; GFX11-TRUE16-NEXT:  ; %bb.2: ; %bb15
-; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s14
-; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s21
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, -1
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-TRUE16-NEXT:    s_branch .LBB2_12
-; GFX11-TRUE16-NEXT:  .LBB2_3:
-; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT:  .LBB2_4: ; %bb16
-; GFX11-TRUE16-NEXT:    s_load_b32 s1, s[16:17], 0x54
-; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s23, 0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s8, -1
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-TRUE16-NEXT:    s_and_b32 s9, s23, 1
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_bitcmp1_b32 s1, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s9, 0
-; GFX11-TRUE16-NEXT:    s_cbranch_scc0 .LBB2_8
-; GFX11-TRUE16-NEXT:  ; %bb.5: ; %bb18.preheader
-; GFX11-TRUE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mul_hi_u32 s8, s29, s28
-; GFX11-TRUE16-NEXT:    s_mul_i32 s9, s29, s28
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_alignbit_b32 v0, s8, s9, 1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s9, 0
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s8, 1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_lshr_b32 s8, s8, s30
-; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s22
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_mul_i32 s8, s8, s20
-; GFX11-TRUE16-NEXT:    s_or_b32 s8, s19, s8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_lshl_b64 s[20:21], s[8:9], 1
-; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s9
-; GFX11-TRUE16-NEXT:    global_load_u16 v1, v0, s[20:21]
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-TRUE16-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-TRUE16-NEXT:    .p2align 6
-; GFX11-TRUE16-NEXT:  .LBB2_6: ; %bb18
-; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s8, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s1, s8
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, exec_lo
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s8, s19, s13
-; GFX11-TRUE16-NEXT:    s_and_b32 s13, 0xffff, s9
-; GFX11-TRUE16-NEXT:    s_and_b32 s8, s8, 1
-; GFX11-TRUE16-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX11-TRUE16-NEXT:    s_and_b32 s20, s2, exec_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s13, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s13, s19, s13
-; GFX11-TRUE16-NEXT:    s_or_b32 s19, s9, 0x100
-; GFX11-TRUE16-NEXT:    s_and_b32 s13, 1, s13
-; GFX11-TRUE16-NEXT:    s_cmp_eq_u32 s13, 1
-; GFX11-TRUE16-NEXT:    s_cselect_b32 s9, s19, s9
-; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_6
-; GFX11-TRUE16-NEXT:  ; %bb.7: ; %Flow
-; GFX11-TRUE16-NEXT:    s_mov_b32 s8, 0
-; GFX11-TRUE16-NEXT:  .LBB2_8: ; %Flow12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s8
-; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_12
-; GFX11-TRUE16-NEXT:  ; %bb.9:
-; GFX11-TRUE16-NEXT:    s_xor_b32 s1, s1, -1
-; GFX11-TRUE16-NEXT:  .LBB2_10: ; %bb17
-; GFX11-TRUE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s1
-; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_10
-; GFX11-TRUE16-NEXT:  ; %bb.11: ; %Flow6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s18, -1
-; GFX11-TRUE16-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-TRUE16-NEXT:    s_and_b32 s20, s0, exec_lo
-; GFX11-TRUE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
-; GFX11-TRUE16-NEXT:  .LBB2_13: ; %Flow9
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s3, s0
-; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB2_15
-; GFX11-TRUE16-NEXT:  ; %bb.14: ; %bb43
-; GFX11-TRUE16-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-TRUE16-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-TRUE16-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-TRUE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-TRUE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s14
-; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-TRUE16-NEXT:    s_mov_b32 s14, s15
-; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-TRUE16-NEXT:    s_or_b32 s20, s20, exec_lo
-; GFX11-TRUE16-NEXT:  .LBB2_15: ; %Flow14
-; GFX11-TRUE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-TRUE16-NEXT:    s_and_saveexec_b32 s0, s20
-; GFX11-TRUE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
-; GFX11-TRUE16-NEXT:    ; divergent unreachable
-; GFX11-TRUE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
-; GFX11-TRUE16-NEXT:    s_endpgm
-; GFX11-FAKE16-LABEL: f2:
-; GFX11-FAKE16:       ; %bb.0: ; %bb
-; GFX11-FAKE16-NEXT:    s_mov_b64 s[16:17], s[4:5]
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-FAKE16-NEXT:    s_load_b32 s19, s[16:17], 0x24
-; GFX11-FAKE16-NEXT:    s_mov_b32 s12, s13
-; GFX11-FAKE16-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
-; GFX11-FAKE16-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GFX11-FAKE16-NEXT:    s_mov_b32 s20, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
-; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_mul_lo_u32 v0, s19, v0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_13
-; GFX11-FAKE16-NEXT:  ; %bb.1: ; %bb14
-; GFX11-FAKE16-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
-; GFX11-FAKE16-NEXT:    s_mov_b32 s18, 0
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s24, -1, 0
-; GFX11-FAKE16-NEXT:    s_bitcmp0_b32 s21, 0
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_3
-; GFX11-FAKE16-NEXT:  ; %bb.2: ; %bb15
-; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s21, s14
-; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s21
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_4
-; GFX11-FAKE16-NEXT:    s_branch .LBB2_12
-; GFX11-FAKE16-NEXT:  .LBB2_3:
-; GFX11-FAKE16-NEXT:    s_mov_b32 s2, 0
-; GFX11-FAKE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT:    s_cbranch_vccnz .LBB2_12
-; GFX11-FAKE16-NEXT:  .LBB2_4: ; %bb16
-; GFX11-FAKE16-NEXT:    s_load_b32 s0, s[16:17], 0x54
-; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s23, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s23, 1
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_bitcmp1_b32 s0, 0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, -1
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s8, -1, 0
-; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s1, 0
-; GFX11-FAKE16-NEXT:    s_cbranch_scc0 .LBB2_8
-; GFX11-FAKE16-NEXT:  ; %bb.5: ; %bb18.preheader
-; GFX11-FAKE16-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_mul_hi_u32 s0, s29, s28
-; GFX11-FAKE16-NEXT:    s_mul_i32 s1, s29, s28
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_alignbit_b32 v0, s0, s1, 1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s0, 1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_lshr_b32 s0, s0, s30
-; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s22
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_mul_i32 s0, s0, s20
-; GFX11-FAKE16-NEXT:    s_or_b32 s0, s19, s0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, s1
-; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[20:21]
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
-; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-FAKE16-NEXT:    s_mov_b32 vcc_lo, 0
-; GFX11-FAKE16-NEXT:    .p2align 6
-; GFX11-FAKE16-NEXT:  .LBB2_6: ; %bb18
-; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v0
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, -1, 0
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s8, s1
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, exec_lo
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s1, s19, s13
-; GFX11-FAKE16-NEXT:    s_and_b32 s13, 0xffff, s0
-; GFX11-FAKE16-NEXT:    s_and_b32 s1, s1, 1
-; GFX11-FAKE16-NEXT:    s_cmp_lg_u32 s13, 0
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX11-FAKE16-NEXT:    s_and_b32 s20, s9, exec_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s13, v1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s13, s19, s13
-; GFX11-FAKE16-NEXT:    s_or_b32 s19, s0, 0x100
-; GFX11-FAKE16-NEXT:    s_and_b32 s13, 1, s13
-; GFX11-FAKE16-NEXT:    s_cmp_eq_u32 s13, 1
-; GFX11-FAKE16-NEXT:    s_cselect_b32 s0, s19, s0
-; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_6
-; GFX11-FAKE16-NEXT:  ; %bb.7: ; %Flow
-; GFX11-FAKE16-NEXT:    s_mov_b32 s0, 0
-; GFX11-FAKE16-NEXT:  .LBB2_8: ; %Flow12
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_12
-; GFX11-FAKE16-NEXT:  ; %bb.9:
-; GFX11-FAKE16-NEXT:    s_xor_b32 s0, s8, -1
-; GFX11-FAKE16-NEXT:  .LBB2_10: ; %bb17
-; GFX11-FAKE16-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
-; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB2_10
-; GFX11-FAKE16-NEXT:  ; %bb.11: ; %Flow6
-; GFX11-FAKE16-NEXT:    s_mov_b32 s18, -1
-; GFX11-FAKE16-NEXT:  .LBB2_12: ; %Flow11
-; GFX11-FAKE16-NEXT:    s_and_b32 s20, s2, exec_lo
-; GFX11-FAKE16-NEXT:    s_or_not1_b32 s0, s18, exec_lo
-; GFX11-FAKE16-NEXT:  .LBB2_13: ; %Flow9
-; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s3, s0
-; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB2_15
-; GFX11-FAKE16-NEXT:  ; %bb.14: ; %bb43
-; GFX11-FAKE16-NEXT:    s_add_u32 s8, s16, 0x58
-; GFX11-FAKE16-NEXT:    s_addc_u32 s9, s17, 0
-; GFX11-FAKE16-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-FAKE16-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
-; GFX11-FAKE16-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-FAKE16-NEXT:    s_mov_b32 s13, s14
-; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-FAKE16-NEXT:    s_mov_b32 s14, s15
-; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-FAKE16-NEXT:    s_or_b32 s20, s20, exec_lo
-; GFX11-FAKE16-NEXT:  .LBB2_15: ; %Flow14
-; GFX11-FAKE16-NEXT:    s_or_b32 exec_lo, exec_lo, s3
-; GFX11-FAKE16-NEXT:    s_and_saveexec_b32 s0, s20
-; GFX11-FAKE16-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
-; GFX11-FAKE16-NEXT:    ; divergent unreachable
-; GFX11-FAKE16-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
-; GFX11-FAKE16-NEXT:    s_endpgm
-
+; GFX11-LABEL: f2:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
+; GFX11-NEXT:    v_mov_b32_e32 v31, v0
+; GFX11-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-NEXT:    s_mov_b32 s12, s13
+; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s20, 0
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB2_13
+; GFX11-NEXT:  ; %bb.1: ; %bb14
+; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
+; GFX11-NEXT:    s_mov_b32 s18, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
+; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
+; GFX11-NEXT:  ; %bb.2: ; %bb15
+; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-NEXT:    s_mov_b32 s13, s14
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s21, s14
+; GFX11-NEXT:    s_mov_b32 s14, s15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s14, s21
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_cbranch_execz .LBB2_4
+; GFX11-NEXT:    s_branch .LBB2_12
+; GFX11-NEXT:  .LBB2_3:
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
+; GFX11-NEXT:  .LBB2_4: ; %bb16
+; GFX11-NEXT:    s_load_b32 s0, s[16:17], 0x54
+; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
+; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX11-NEXT:    s_and_b32 s1, s23, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
+; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
+; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mul_hi_u32 s0, s29, s28
+; GFX11-NEXT:    s_mul_i32 s1, s29, s28
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_or_b32 s0, s0, 1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s0, s0, s30
+; GFX11-NEXT:    s_mul_i32 s0, s0, s22
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_mul_i32 s0, s0, s20
+; GFX11-NEXT:    s_or_b32 s0, s19, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
+; GFX11-NEXT:    s_mov_b32 s0, s1
+; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB2_6: ; %bb18
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    v_readfirstlane_b32 s13, v0
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    s_and_b32 s1, s8, s1
+; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
+; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
+; GFX11-NEXT:    s_and_b32 s1, s1, 1
+; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
+; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX11-NEXT:    s_and_b32 s20, s9, exec_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
+; GFX11-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
+; GFX11-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
+; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s0, s13, s0
+; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
+; GFX11-NEXT:  ; %bb.7: ; %Flow
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB2_8: ; %Flow12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
+; GFX11-NEXT:  ; %bb.9:
+; GFX11-NEXT:    s_xor_b32 s0, s8, -1
+; GFX11-NEXT:  .LBB2_10: ; %bb17
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
+; GFX11-NEXT:  ; %bb.11: ; %Flow6
+; GFX11-NEXT:    s_mov_b32 s18, -1
+; GFX11-NEXT:  .LBB2_12: ; %Flow11
+; GFX11-NEXT:    s_and_b32 s20, s2, exec_lo
+; GFX11-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-NEXT:  .LBB2_13: ; %Flow9
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_and_saveexec_b32 s3, s0
+; GFX11-NEXT:    s_cbranch_execz .LBB2_15
+; GFX11-NEXT:  ; %bb.14: ; %bb43
+; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
+; GFX11-NEXT:    s_addc_u32 s9, s17, 0
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
+; GFX11-NEXT:    s_mov_b32 s13, s14
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s14, s15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    s_or_b32 s20, s20, exec_lo
+; GFX11-NEXT:  .LBB2_15: ; %Flow14
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s20
+; GFX11-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
+; GFX11-NEXT:    ; divergent unreachable
+; GFX11-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
+; GFX11-NEXT:    s_endpgm
 bb:
   %i = tail call i32 @llvm.amdgcn.workitem.id.x()
   %i12 = mul i32 %arg, %i


