[llvm] [AMDGPU][True16] true16 for v_cndmask_b16 (PR #119736)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 13 10:21:48 PST 2024


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/119736

>From 0b4967f777d7ab95a1262c02e86ec73c3dd7121a Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 12 Dec 2024 12:05:32 -0500
Subject: [PATCH 1/2] True16 for v_cndmask_b16 in MC

---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp |   8 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  14 +-
 llvm/lib/Target/AMDGPU/VOP2Instructions.td    |  26 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vop3.s          | 173 ++++++++-------
 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s    | 187 +++++++++-------
 llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s     |  85 +++++---
 llvm/test/MC/AMDGPU/gfx12_asm_vop3.s          | 173 ++++++++-------
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s    | 203 ++++++++++--------
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s     |  95 ++++----
 .../Disassembler/AMDGPU/gfx11_dasm_vop3.txt   |  95 +++++---
 .../AMDGPU/gfx11_dasm_vop3_dpp16.txt          | 111 +++++++---
 .../AMDGPU/gfx11_dasm_vop3_dpp8.txt           |  57 ++++-
 .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt   |  95 +++++---
 .../AMDGPU/gfx12_dasm_vop3_dpp16.txt          | 101 ++++++---
 .../AMDGPU/gfx12_dasm_vop3_dpp8.txt           |  47 +++-
 15 files changed, 950 insertions(+), 520 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 5207201e14c091..6baef137df5e16 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -3007,8 +3007,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
     switch (I.getOpcode()) {
     case AMDGPU::V_ADDC_U32_e32:
     case AMDGPU::V_ADDC_U32_dpp:
-    case AMDGPU::V_CNDMASK_B16_e32:
-    case AMDGPU::V_CNDMASK_B16_dpp:
+    case AMDGPU::V_CNDMASK_B16_fake16_e32:
+    case AMDGPU::V_CNDMASK_B16_fake16_dpp:
     case AMDGPU::V_CNDMASK_B32_e32:
     case AMDGPU::V_CNDMASK_B32_dpp:
     case AMDGPU::V_DIV_FMAS_F32_e64:
@@ -3023,8 +3023,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
              HazardReg == AMDGPU::VCC_HI;
     case AMDGPU::V_ADDC_U32_e64:
     case AMDGPU::V_ADDC_U32_e64_dpp:
-    case AMDGPU::V_CNDMASK_B16_e64:
-    case AMDGPU::V_CNDMASK_B16_e64_dpp:
+    case AMDGPU::V_CNDMASK_B16_fake16_e64:
+    case AMDGPU::V_CNDMASK_B16_fake16_e64_dpp:
     case AMDGPU::V_CNDMASK_B32_e64:
     case AMDGPU::V_CNDMASK_B32_e64_dpp:
     case AMDGPU::V_SUBB_U32_e64:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc25d75131cc35..efdf0a46d19c8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1245,11 +1245,21 @@ class VOPSelectPat <ValueType vt> : GCNPat <
   (vt (select i1:$src0, vt:$src1, vt:$src2)),
   (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
 >;
+class VOPSelectPat_fake16 <ValueType vt> : GCNPat <
+  (vt (select i1:$src0, vt:$src1, vt:$src2)),
+  (V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0)
+>;
 
 def : VOPSelectModsPat <i32>;
 def : VOPSelectModsPat <f32>;
-def : VOPSelectPat <f16>;
-def : VOPSelectPat <i16>;
+let True16Predicate = NotHasTrue16BitInsts in {
+  def : VOPSelectPat <f16>;
+  def : VOPSelectPat <i16>;
+} // End True16Predicate = NotHasTrue16BitInsts
+let True16Predicate = UseFakeTrue16Insts in {
+  def : VOPSelectPat_fake16 <f16>;
+  def : VOPSelectPat_fake16 <i16>;
+} // End True16Predicate = UseFakeTrue16Insts
 
 let AddedComplexity = 1 in {
 def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 384fec0079a5d9..d9e4fcc53f0c4b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -714,6 +714,26 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
 def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
 def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
 // V_CNDMASK_B16 is VOP3 only
+def VOP2e_I16_I16_I16_I1_true16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
+  let IsTrue16 = 1;
+  let IsRealTrue16 = 1;
+  let HasOpSel = 1;
+  let DstRC64 = getVALUDstForVT<DstVT, 1, 1>.ret;
+  let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
+  let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
+  let Src2RC64 = getVOP3SrcForVT<Src2VT, 1/*IsTrue16*/>.ret;
+  let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
+  let HasSrc2Mods = 0;
+  let InsVOP3OpSel = getInsVOP3Base<Src0RC64, Src1RC64,
+                    Src2RC64, NumSrcArgs,
+                    HasClamp, 1/*HasModifiers*/, 0/*HasSrc2Mods*/, HasOMod,
+                    Src0Mod, Src1Mod, Src2Mod, 1/*HasOpSel*/>.ret;
+  let Src0VOP3DPP = VGPRSrc_16;
+  let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
+  let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 0/*IsFake16*/>.ret;
+  let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 0/*IsFake16*/>.ret;
+}
 def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
   let IsTrue16 = 1;
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -765,8 +785,8 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
 // VOP2 Instructions
 //===----------------------------------------------------------------------===//
 
-let SubtargetPredicate = isGFX11Plus in
-defm V_CNDMASK_B16 : VOP2eInst <"v_cndmask_b16", VOP2e_I16_I16_I16_I1_fake16>;
+defm V_CNDMASK_B16_t16 : VOP2eInst <"v_cndmask_b16_t16", VOP2e_I16_I16_I16_I1_true16>;
+defm V_CNDMASK_B16_fake16 : VOP2eInst <"v_cndmask_b16_fake16", VOP2e_I16_I16_I16_I1_fake16>;
 defm V_CNDMASK_B32 : VOP2eInst_VOPD <"v_cndmask_b32", VOP2e_I32_I32_I32_I1, 0x9, "v_cndmask_b32">;
 let SubtargetPredicate = HasMadMacF32Insts, isReMaterializable = 1 in
 def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
@@ -1835,7 +1855,7 @@ defm V_FMAMK_F16           : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037
 defm V_FMAAK_F16           : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">;
 
 // VOP3 only.
-defm V_CNDMASK_B16         : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
+defm V_CNDMASK_B16         : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x25d, "v_cndmask_b16">;
 defm V_LDEXP_F32           : VOP3Only_Realtriple_gfx11_gfx12<0x31c>;
 defm V_BFM_B32             : VOP3Only_Realtriple_gfx11_gfx12<0x31d>;
 defm V_BCNT_U32_B32        : VOP3Only_Realtriple_gfx11_gfx12<0x31e>;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index b649bab532f262..955915f69ff612 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -890,104 +890,131 @@ v_bfm_b32 v5, src_scc, vcc_lo
 v_bfm_b32 v255, 0xaf123456, vcc_hi
 // GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi      ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cndmask_b16 v5, v1, src_scc, s3
-// W32: v_cndmask_b16 v5, v1, src_scc, s3       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00]
-// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v1.l, src_scc, s3
+// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v255, 0.5, s3
-// W32: v_cndmask_b16 v5, v255, 0.5, s3         ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
-// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.l, 0.5, s3
+// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
 
-v_cndmask_b16 v5, s105, s105, s3
-// W32: v_cndmask_b16 v5, s105, s105, s3        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
-
-v_cndmask_b16 v5, vcc_hi, v2, s3
-// W32: v_cndmask_b16 v5, vcc_hi, v2, s3        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, s105, s105, s3
+// W32: v_cndmask_b16 v5.l, s105, s105, s3      ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, ttmp15, ttmp15, s3
-// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, vcc_hi, v2.l, s3
+// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, m0, v255, s3
-// W32: v_cndmask_b16 v5, m0, v255, s3          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, ttmp15, ttmp15, s3
+// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_lo, exec_lo, s3
-// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, m0, v255.l, s3
+// W32: v_cndmask_b16 v5.l, m0, v255.l, s3      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_hi, exec_hi, s3
-// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_lo, exec_lo, s3
+// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, null, m0, s105
-// W32: v_cndmask_b16 v5, null, m0, s105        ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_hi, exec_hi, s3
+// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo
-// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, null, m0, s105
+// W32: v_cndmask_b16 v5.l, null, m0, s105      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, 0.5, -1, vcc_hi
-// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi       ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo
+// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -|src_scc|, null, ttmp15
-// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21]
+v_cndmask_b16 v5.l, 0.5, -1, vcc_hi
+// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi     ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01]
 // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v1, src_scc, s[6:7]
-// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
-// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction
+v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15
+// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21]
+// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v255, 0.5, s[6:7]
-// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7]     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00]
-// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7]
+// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction
 
-v_cndmask_b16 v5, s105, s105, s[6:7]
-// W64: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7]
+// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
 
-v_cndmask_b16 v5, vcc_hi, v2, s[6:7]
-// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, s105, s105, s[6:7]
+// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7]
-// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7]
+// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, m0, v255, s[6:7]
-// W64: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7]
+// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7]
-// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, m0, v255.l, s[6:7]
+// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7]
-// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7]
+// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, null, m0, s[6:7]
-// W64: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7]
+// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105]
-// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, null, m0, s[6:7]
+// W64: v_cndmask_b16 v5.l, null, m0, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105]
+// W64: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, 0.5, -1, vcc
+// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc        ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15]
+// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction
+
+v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo
+// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, 0x3800, -1, vcc
+// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, v255.h, 0.5, s3
+// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3     ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, m0, v255.h, s3
+// W32: v_cndmask_b16 v5.l, m0, v255.h, s3      ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, 0.5, -1, vcc
-// W64: v_cndmask_b16 v5, 0.5, -1, vcc          ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7]
+// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15]
-// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+v_cndmask_b16 v5.l, m0, v255.h, s[6:7]
+// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7]  ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
 // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null
-// GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX11: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_cubeid_f32 v5, v1, v2, s3
 // GFX11: v_cubeid_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index e6f868d2b40e7e..0a28432dd5b223 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -753,112 +753,139 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1
-// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1
-// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
+v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
+
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x50,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
 
 v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
index 160863b19012dc..daacdc2a040497 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s
@@ -412,44 +412,71 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
-// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
 
 v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
index c7cd88e81583f8..a2342d686d69db 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
@@ -848,104 +848,131 @@ v_bfm_b32 v5, src_scc, vcc_lo
 v_bfm_b32 v255, 0xaf123456, vcc_hi
 // GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi      ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_cndmask_b16 v5, v1, src_scc, s3
-// W32: v_cndmask_b16 v5, v1, src_scc, s3       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00]
-// W64-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v1.l, src_scc, s3
+// W32: v_cndmask_b16 v5.l, v1.l, src_scc, s3   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v255, 0.5, s3
-// W32: v_cndmask_b16 v5, v255, 0.5, s3         ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
-// W64-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.l, 0.5, s3
+// W32: v_cndmask_b16 v5.l, v255.l, 0.5, s3     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
 
-v_cndmask_b16 v5, s105, s105, s3
-// W32: v_cndmask_b16 v5, s105, s105, s3        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
-
-v_cndmask_b16 v5, vcc_hi, v2, s3
-// W32: v_cndmask_b16 v5, vcc_hi, v2, s3        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, s105, s105, s3
+// W32: v_cndmask_b16 v5.l, s105, s105, s3      ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, ttmp15, ttmp15, s3
-// W32: v_cndmask_b16 v5, ttmp15, ttmp15, s3    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, vcc_hi, v2.l, s3
+// W32: v_cndmask_b16 v5.l, vcc_hi, v2.l, s3    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x0e,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, m0, v255, s3
-// W32: v_cndmask_b16 v5, m0, v255, s3          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, ttmp15, ttmp15, s3
+// W32: v_cndmask_b16 v5.l, ttmp15, ttmp15, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_lo, exec_lo, s3
-// W32: v_cndmask_b16 v5, exec_lo, exec_lo, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, m0, v255.l, s3
+// W32: v_cndmask_b16 v5.l, m0, v255.l, s3      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_hi, exec_hi, s3
-// W32: v_cndmask_b16 v5, exec_hi, exec_hi, s3  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_lo, exec_lo, s3
+// W32: v_cndmask_b16 v5.l, exec_lo, exec_lo, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, null, m0, s105
-// W32: v_cndmask_b16 v5, null, m0, s105        ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_hi, exec_hi, s3
+// W32: v_cndmask_b16 v5.l, exec_hi, exec_hi, s3 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x0c,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo
-// W32: v_cndmask_b16 v5, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, null, m0, s105
+// W32: v_cndmask_b16 v5.l, null, m0, s105      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0xa4,0x01]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, 0.5, -1, vcc_hi
-// W32: v_cndmask_b16 v5, 0.5, -1, vcc_hi       ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01]
-// W64-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo
+// W32: v_cndmask_b16 v5.l, -1, -|vcc_lo|, vcc_lo ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa8,0x41]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -|src_scc|, null, ttmp15
-// W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21]
+v_cndmask_b16 v5.l, 0.5, -1, vcc_hi
+// W32: v_cndmask_b16 v5.l, 0.5, -1, vcc_hi     ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xad,0x01]
 // W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v1, src_scc, s[6:7]
-// W64: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
-// W32-ERR: :[[@LINE-2]]:32: error: invalid operand for instruction
+v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15
+// W32: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp15 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xec,0x21]
+// W64-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction
 
-v_cndmask_b16 v5, v255, 0.5, s[6:7]
-// W64: v_cndmask_b16 v5, v255, 0.5, s[6:7]     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00]
-// W32-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7]
+// W64: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:36: error: invalid operand for instruction
 
-v_cndmask_b16 v5, s105, s105, s[6:7]
-// W64: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7]
+// W64: v_cndmask_b16 v5.l, v255.l, 0.5, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
 
-v_cndmask_b16 v5, vcc_hi, v2, s[6:7]
-// W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, s105, s105, s[6:7]
+// W64: v_cndmask_b16 v5.l, s105, s105, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7]
-// W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7]
+// W64: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, m0, v255, s[6:7]
-// W64: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7]
+// W64: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7]
-// W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, m0, v255.l, s[6:7]
+// W64: v_cndmask_b16 v5.l, m0, v255.l, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7]
-// W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7]
+// W64: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, null, m0, s[6:7]
-// W64: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7]
+// W64: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105]
-// W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, null, m0, s[6:7]
+// W64: v_cndmask_b16 v5.l, null, m0, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105]
+// W64: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, 0.5, -1, vcc
+// W64: v_cndmask_b16 v5.l, 0.5, -1, vcc        ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15]
+// W64: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+// W32-ERR: :[[@LINE-2]]:23: error: invalid operand for instruction
+
+v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null
+// GFX12: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
-v_cndmask_b16 v5, 0.5, -1, vcc
-// W64: v_cndmask_b16 v5, 0.5, -1, vcc          ; encoding: [0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01]
-// W32-ERR: :[[@LINE-2]]:19: error: invalid operand for instruction
+v_cndmask_b16 v5.l, v255.h, 0.5, s3
+// W32: v_cndmask_b16 v5.l, v255.h, 0.5, s3     ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x0d,0x00]
+// W64-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, m0, v255.h, s3
+// W32: v_cndmask_b16 v5.l, m0, v255.h, s3      ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x0f,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7]
+// W64: v_cndmask_b16 v5.l, v255.h, 0.5, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00]
+// W32-ERR: :[[@LINE-2]]:34: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, m0, v255.h, s[6:7]
+// W64: v_cndmask_b16 v5.l, m0, v255.h, s[6:7]  ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+// W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
+
+v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo
+// W32: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+// W64-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15]
-// W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+v_cndmask_b16 v5.l, 0x3800, -1, vcc
+// W64: v_cndmask_b16 v5.l, 0x3800, -1, vcc     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
 // W32-ERR: :[[@LINE-2]]:21: error: invalid operand for instruction
 
-v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null
-// GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null
+// GFX12: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 v_cubeid_f32 v5, v1, v2, s3
 // GFX12: v_cubeid_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
index 894acc5e94e1d3..04a8821f0dbc7d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s
@@ -857,128 +857,147 @@ v_bfm_b32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1b,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0xe4,0x00,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0c,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x0d,0x00,0x01,0x40,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x41,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x01,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x0f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x11,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x1f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x0e,0x00,0x01,0x21,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa6,0x01,0x01,0x2f,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xae,0x01,0x01,0x50,0x01,0xff]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1
-// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x18,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror
-// W64: v_cndmask_b16_e64_dpp v5, v1, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x14,0x19,0x00,0x01,0x41,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1
-// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
+v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xae,0x41,0x01,0x5f,0x01,0x01]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xee,0x21,0x01,0x60,0x09,0x13]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x09,0x13]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x05,0x30]
 
 v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
index a5bfec80d8039e..ed585252c64477 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s
@@ -504,56 +504,75 @@ v_bfm_b32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x00,0x1d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0e,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, 10, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x14,0x0d,0x00,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:39: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s105 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa6,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xae,0x01,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
-// W32: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// W32: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
-// W64-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:35: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:41: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0]
-// W64: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
-// W64: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
-// W64: v_cndmask_b16_e64_dpp v5, -v1, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05]
-// W32-ERR: :[[@LINE-2]]:38: error: invalid operand for instruction
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |s2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xea,0x04,0xe8,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:42: error: invalid operand for instruction
 
-v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cndmask_b16_e64_dpp v255.l, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0]
+// W32: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_hi dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xae,0x41,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W32: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp15 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xee,0x21,0x01,0x77,0x39,0x05]
+// W64-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0]
+// W64: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1
+// W64: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x12,0x5d,0xd6,0xea,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+// W32-ERR: :[[@LINE-2]]:44: error: invalid operand for instruction
+
+v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x43,0x5d,0xd6,0xe9,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
 
 v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
index 60d213f1ff9372..5287fb277b8523 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt
@@ -1036,55 +1036,100 @@
 # GFX11: v_bfm_b32 v255, 0xaf123456, vcc_hi      ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00
-# W32: v_cndmask_b16 v5, v1, src_scc, s6       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
-# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
 
 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00
-# W32: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00
-# W32: v_cndmask_b16 v5, s105, s105, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
-# W64: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00
-# W32: v_cndmask_b16 v5, vcc_hi, v2, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
-# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00
-# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
-# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00
-# W32: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
-# W64: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00
-# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
-# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00
-# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
-# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00
-# W32: v_cndmask_b16 v5, null, m0, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
-# W64: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
 
 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41
-# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104   ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
-# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104   ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W64-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
 
 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01
-# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo    ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
-# W64: v_cndmask_b16 v5, 0x3800, -1, vcc       ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo    ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc       ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
 
 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21
-# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
-# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
 
 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX11: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00
+# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6  ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+
+0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00
+# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6      ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7]  ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+
+0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00
 # GFX11: v_cubeid_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
index 35ad673de75db4..d4a489aaed66b5 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt
@@ -714,65 +714,118 @@
 # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
 
 0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30
-# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xf2,0x21,0x01,0x6f,0x09,0x30]
 
 0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30
-# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xf2,0x41,0x01,0x6f,0x09,0x30]
 
 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30
-# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+
+0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+
+0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+
+0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
index 3a37a19d6d3af9..77721ecb86d115 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt
@@ -372,29 +372,64 @@
 # GFX11: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00
-# GFX11: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.l|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xf2,0x21,0x01,0x00,0x00,0x00]
 
 0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00
-# GFX11: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.l|, -v2.l, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, null dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xf2,0x41,0x01,0x00,0x00,0x00]
 
 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00
-# GFX11: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+
+0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+
+0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+
+0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX11: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
index c3bab320b0ba2d..fc1406c243757a 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt
@@ -1000,55 +1000,100 @@
 # GFX12: v_bfm_b32 v255, 0xaf123456, vcc_hi      ; encoding: [0xff,0x00,0x1d,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
 0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00
-# W32: v_cndmask_b16 v5, v1, src_scc, s6       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
-# W64: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s6   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v1, src_scc, s6       ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v1.l, src_scc, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v1, src_scc, s[6:7]   ; encoding: [0x05,0x00,0x5d,0xd6,0x01,0xfb,0x19,0x00]
 
 0x05,0x00,0x5d,0xd6,0xff,0xe1,0x19,0x00
-# W32: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
-# W64: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v255.l, 0x3800, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
 
 0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00
-# W32: v_cndmask_b16 v5, s105, s105, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
-# W64: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, s105, s105, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, s105, s105, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, s105, s105, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, s105, s105, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x69,0xd2,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00
-# W32: v_cndmask_b16 v5, vcc_hi, v2, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
-# W64: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, vcc_hi, v2.l, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, vcc_hi, v2, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x6b,0x04,0x1a,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00
-# W32: v_cndmask_b16 v5, ttmp15, ttmp15, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
-# W64: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s6    ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, ttmp15, ttmp15, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7b,0xf6,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00
-# W32: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
-# W64: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.l, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00
-# W32: v_cndmask_b16 v5, exec_lo, exec_lo, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
-# W64: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, exec_lo, exec_lo, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7e,0xfc,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00
-# W32: v_cndmask_b16 v5, exec_hi, exec_hi, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
-# W64: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s6 ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s6  ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, exec_hi, exec_hi, s[6:7] ; encoding: [0x05,0x00,0x5d,0xd6,0x7f,0xfe,0x18,0x00]
 
 0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00
-# W32: v_cndmask_b16 v5, null, m0, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
-# W64: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, null, m0, s6        ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, null, m0, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, null, m0, s[6:7]    ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, null, m0, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7c,0xfa,0x18,0x00]
 
 0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41
-# W32: v_cndmask_b16 v5, -1, -|vcc_lo|, s104   ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
-# W64: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W32-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s104 ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W32-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s104   ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W64-REAL16: v_cndmask_b16 v5.l, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
+# W64-FAKE16: v_cndmask_b16 v5, -1, -|vcc_lo|, s[104:105] ; encoding: [0x05,0x02,0x5d,0xd6,0xc1,0xd4,0xa0,0x41]
 
 0x05,0x00,0x5d,0xd6,0xf0,0x82,0xa9,0x01
-# W32: v_cndmask_b16 v5, 0x3800, -1, vcc_lo    ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
-# W64: v_cndmask_b16 v5, 0x3800, -1, vcc       ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc_lo  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc_lo    ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, 0x3800, -1, vcc     ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, 0x3800, -1, vcc       ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0x82,0xa9,0x01,0x00,0x38,0x00,0x00]
 
 0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21
-# W32: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
-# W64: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W32-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W32-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp14 ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W64-REAL16: v_cndmask_b16 v5.l, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
+# W64-FAKE16: v_cndmask_b16 v5, -|src_scc|, null, ttmp[14:15] ; encoding: [0x05,0x01,0x5d,0xd6,0xfd,0xf8,0xe8,0x21]
 
 0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
-# GFX12: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v255.l, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+
+0x05,0x08,0x5d,0xd6,0xff,0xe1,0x19,0x00
+# W32-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s6  ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s6      ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, v255.h, 0x3800, s[6:7] ; encoding: [0x05,0x08,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, v255, 0x3800, s[6:7]  ; encoding: [0x05,0x00,0x5d,0xd6,0xff,0xff,0x19,0x00,0x00,0x38,0x00,0x00]
+
+0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00
+# W32-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s6      ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W32-FAKE16: v_cndmask_b16 v5, m0, v255, s6          ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-REAL16: v_cndmask_b16 v5.l, m0, v255.h, s[6:7]  ; encoding: [0x05,0x10,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+# W64-FAKE16: v_cndmask_b16 v5, m0, v255, s[6:7]      ; encoding: [0x05,0x00,0x5d,0xd6,0x7d,0xfe,0x1b,0x00]
+
+0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00
+# W32-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-REAL16: v_cndmask_b16 v255.h, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x43,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16 v255, -|0xfe0b|, -|vcc_hi|, null ; encoding: [0xff,0x03,0x5d,0xd6,0xff,0xd6,0xf0,0x61,0x0b,0xfe,0x00,0x00]
 
 0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00
 # GFX12: v_cubeid_f32 v5, v1, v2, s3             ; encoding: [0x05,0x00,0x0c,0xd6,0x01,0x05,0x0e,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
index 232ed8d23c9c6e..cfc1b42b311945 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt
@@ -765,59 +765,106 @@
 # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1b,0x00,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0xe4,0x00,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x40,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x41,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x01,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x0f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x11,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x1f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x21,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0x1a,0x00,0x01,0x2f,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xa2,0x01,0x01,0x50,0x01,0xff]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xaa,0x01,0x01,0x5f,0x01,0x01]
 
 0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x00,0x5d,0xd6,0xfa,0x04,0xea,0x01,0x01,0x60,0x01,0x13]
 
 0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30
-# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xfa,0xfe,0xf3,0x01,0xff,0x6f,0x0d,0x30]
+
+0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x09,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x01,0x5d,0xd6,0xfa,0x04,0xaa,0x41,0x01,0x5f,0x01,0x01]
+
+0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x12,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x02,0x5d,0xd6,0xfa,0x04,0xea,0x21,0x01,0x60,0x01,0x13]
+
+0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xfa,0xfe,0xf3,0x61,0xff,0x6f,0x0d,0x30]
 
 0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff
 # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0c,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
index 469b199053d475..6aebead2ad2f6e 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt
@@ -423,23 +423,52 @@
 # GFX12: v_bfm_b32_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x1d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s6 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[6:7] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0x1a,0x00,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s104 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, s[104:105] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xa2,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xaa,0x01,0x01,0x77,0x39,0x05]
 
 0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05
-# W32: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
-# W64: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, v1.l, v2.l, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, v1, v2, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x5d,0xd6,0xe9,0x04,0xea,0x01,0x01,0x77,0x39,0x05]
 
 0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00
-# GFX12: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.l, v255.l, v255.l, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, v255, v255, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x5d,0xd6,0xea,0xfe,0xf3,0x01,0xff,0x00,0x00,0x00]
+
+0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc_lo dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, |v1.h|, -v2.l, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x09,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, |v1|, -v2, vcc dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x5d,0xd6,0xe9,0x04,0xaa,0x41,0x01,0x77,0x39,0x05]
+
+0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05
+# W32-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp14 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W64-REAL16: v_cndmask_b16_e64_dpp v5.l, -v1.l, |v2.h|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x12,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v5, -v1, |v2|, ttmp[14:15] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x02,0x5d,0xd6,0xe9,0x04,0xea,0x21,0x01,0x77,0x39,0x05]
+
+0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00
+# W32-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W32-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W64-REAL16: v_cndmask_b16_e64_dpp v255.h, -|v255.l|, -|v255.l|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x43,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
+# W64-FAKE16: v_cndmask_b16_e64_dpp v255, -|v255|, -|v255|, null dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x03,0x5d,0xd6,0xea,0xfe,0xf3,0x61,0xff,0x00,0x00,0x00]
 
 0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05
 # GFX12: v_cubeid_f32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0c,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]

>From fd996b4ad0597e0b7fca59a8b9fb67c30530d27a Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 12 Dec 2024 17:12:44 -0500
Subject: [PATCH 2/2] add true16 codegen and update test

---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |    8 +
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  135 +-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |  187 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 1497 +++++++++--------
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     |  100 +-
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  156 +-
 .../CodeGen/AMDGPU/extract_vector_elt-f16.ll  |   92 +-
 llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll   |  105 +-
 llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll   |  105 +-
 .../AMDGPU/fmul-2-combine-multi-use.ll        |   27 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     |  368 ++--
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |   31 +-
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |  101 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |   28 +-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  143 +-
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll  |  421 +++--
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll  |  421 +++--
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |   22 +-
 llvm/test/CodeGen/AMDGPU/lround.ll            |    8 +-
 llvm/test/CodeGen/AMDGPU/maximumnum.ll        |   86 +-
 llvm/test/CodeGen/AMDGPU/minimumnum.ll        |   86 +-
 .../AMDGPU/select-fabs-fneg-extract.f16.ll    |  190 +--
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  673 ++++----
 .../AMDGPU/select-flags-to-fmin-fmax.ll       |  252 +--
 llvm/test/CodeGen/AMDGPU/select.f16.ll        |  740 ++++----
 llvm/test/CodeGen/AMDGPU/v_cndmask.ll         |    4 +-
 .../CodeGen/AMDGPU/valu-mask-write-hazard.mir |    6 +-
 27 files changed, 3026 insertions(+), 2966 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index efdf0a46d19c8f..bf8935d0812be8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1245,6 +1245,10 @@ class VOPSelectPat <ValueType vt> : GCNPat <
   (vt (select i1:$src0, vt:$src1, vt:$src2)),
   (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
 >;
+class VOPSelectPat_t16 <ValueType vt> : GCNPat <
+  (vt (select i1:$src0, vt:$src1, vt:$src2)),
+  (V_CNDMASK_B16_t16_e64 0, VSrcT_b16:$src2, 0, VSrcT_b16:$src1, SSrc_i1:$src0)
+>;
 class VOPSelectPat_fake16 <ValueType vt> : GCNPat <
   (vt (select i1:$src0, vt:$src1, vt:$src2)),
   (V_CNDMASK_B16_fake16_e64 0, VSrc_b16:$src2, 0, VSrc_b16:$src1, SSrc_i1:$src0)
@@ -1256,6 +1260,10 @@ let True16Predicate = NotHasTrue16BitInsts in {
   def : VOPSelectPat <f16>;
   def : VOPSelectPat <i16>;
 } // End True16Predicate = NotHasTrue16BitInsts
+let True16Predicate = UseRealTrue16Insts in {
+  def : VOPSelectPat_t16 <f16>;
+  def : VOPSelectPat_t16 <i16>;
+} // End True16Predicate = UseRealTrue16Insts
 let True16Predicate = UseFakeTrue16Insts in {
   def : VOPSelectPat_fake16 <f16>;
   def : VOPSelectPat_fake16 <i16>;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index e289ee759da158..e27d4372d87be4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5294,15 +5294,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s10
 ; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT:    v_mov_b32_e32 v2, s5
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, s0
 ; GFX11-NEXT:    s_ashr_i32 s0, s9, 31
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, 0, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
@@ -5447,20 +5447,20 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], v[0:1]
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, 0, v[2:3]
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, v7, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v2, v2, 0, s0
+; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v8, 0, vcc_lo
-; GFX11-NEXT:    v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, 0x80000000, v3
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5606,21 +5606,22 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, v[4:5], v[0:1]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[6:7], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    s_and_b32 s0, 1, s1
 ; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[2:3], 0
-; GFX11-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX11-NEXT:    s_and_b32 s0, 1, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, 0, s0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v8, 0, s0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
@@ -5846,33 +5847,33 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v9, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v16, vcc_lo, v2, v10, vcc_lo
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[16:17], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, v[8:9], v[0:1]
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, 0, v[10:11]
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[16:17], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_add_co_u32 v12, vcc_lo, v4, v12
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v13, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT:    v_add_co_ci_u32_e32 v19, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[12:13], v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[16:17], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[10:11]
+; GFX11-NEXT:    v_add_co_u32 v10, s1, v4, v12
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v11, s1, v5, v13, s1
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v12, s1, v6, v14, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, 0, s0
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, v[10:11], v[4:5]
+; GFX11-NEXT:    v_add_co_ci_u32_e64 v13, s1, v7, v15, s1
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, 0, v[14:15]
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[6:7]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[12:13], v[6:7]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[12:13], v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v13
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[14:15]
 ; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v4, 0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v2, v4, 0, s0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v17
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5882,10 +5883,10 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v2, vcc_lo
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v12, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v13, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v18, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v19, v7, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, v10, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, v11, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v12, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v13, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -6243,16 +6244,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s18
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    s_ashr_i32 s10, s17, 31
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
 ; GFX11-NEXT:    s_add_i32 s11, s10, 0x80000000
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v2, 0, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, s0
 ; GFX11-NEXT:    s_add_u32 s0, s4, s12
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, 0, s1
 ; GFX11-NEXT:    s_addc_u32 s1, s5, s13
 ; GFX11-NEXT:    s_addc_u32 s2, s6, s14
 ; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
@@ -6268,17 +6269,18 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s12
 ; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
 ; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
 ; GFX11-NEXT:    s_and_b32 s5, 1, s5
-; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s5
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v3, 0, s4
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v1, s4
+; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, 0, s5
 ; GFX11-NEXT:    v_mov_b32_e32 v3, s8
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s16
-; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s9
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s17
@@ -6287,7 +6289,6 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s10, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 43ebe156eb2a28..af96da1bb25adf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -5297,28 +5297,28 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    s_sub_u32 s8, s0, s4
 ; GFX11-NEXT:    s_subb_u32 s9, s1, s5
 ; GFX11-NEXT:    s_subb_u32 s10, s2, s6
-; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    s_subb_u32 s11, s3, s7
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], s[2:3]
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
 ; GFX11-NEXT:    s_cselect_b32 s12, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[4:5], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s12
 ; GFX11-NEXT:    s_cmp_eq_u64 s[6:7], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[6:7], 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX11-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v3, s11
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x80000000
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
-; GFX11-NEXT:    v_mov_b32_e32 v3, s11
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s8 :: v_dual_and_b32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5470,25 +5470,26 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s1, 0, v[0:1]
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b16 v2, v9, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
-; GFX11-NEXT:    v_xor_b32_e32 v0, v0, v8
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5638,29 +5639,29 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
 ; GFX11-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[0:1]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, v[4:5], v[0:1]
 ; GFX11-NEXT:    v_cmp_gt_u64_e64 s0, s[0:1], 0
 ; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], 0
-; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX11-NEXT:    v_cmp_gt_i64_e64 s0, s[2:3], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_i64_e64 s1, s[2:3], 0
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, v[6:7], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX11-NEXT:    s_cselect_b32 s4, 1, 0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s4
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0x80000000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-NEXT:    ; return to shader part epilog
   %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
   %cast = bitcast i128 %result to <4 x float>
@@ -5896,37 +5897,38 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_sub_co_u32 v16, vcc_lo, v0, v8
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v17, vcc_lo, v1, v9, vcc_lo
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v18, vcc_lo, v2, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s1, 0, v[8:9]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, v[16:17], v[0:1]
 ; GFX11-NEXT:    v_sub_co_ci_u32_e32 v19, vcc_lo, v3, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[16:17], v[0:1]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[18:19], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[18:19], v[2:3]
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[8:9]
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_sub_co_u32 v8, vcc_lo, v4, v12
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v9, vcc_lo, v5, v13, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v20, vcc_lo, v6, v14, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v21, vcc_lo, v7, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[10:11]
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[4:5]
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[18:19], v[2:3]
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[10:11]
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s1
+; GFX11-NEXT:    v_sub_co_u32 v8, s1, v4, v12
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v9, s1, v5, v13, s1
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v10, s1, v6, v14, s1
+; GFX11-NEXT:    v_sub_co_ci_u32_e64 v11, s1, v7, v15, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, v[8:9], v[4:5]
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s1, 0, v[12:13]
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[10:11], v[6:7]
 ; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, v[20:21], v[6:7]
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u64_e32 vcc_lo, 0, v[12:13]
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_i64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[20:21], v[6:7]
-; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, v[10:11], v[6:7]
+; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s1
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, 0, v[14:15]
+; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v11
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX11-NEXT:    v_cmp_eq_u64_e64 s0, 0, v[14:15]
+; GFX11-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
+; GFX11-NEXT:    v_add_nc_u32_e32 v7, 0x80000000, v6
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v2, v5, v4, s0
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_ashrrev_i32_e32 v2, 31, v19
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
@@ -5938,8 +5940,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GFX11-NEXT:    v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v5, v9, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v20, v6, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v21, v7, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
   ret <2 x i128> %result
@@ -6303,57 +6305,57 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    s_sub_u32 s18, s0, s8
 ; GFX11-NEXT:    s_subb_u32 s19, s1, s9
 ; GFX11-NEXT:    s_subb_u32 s16, s2, s10
-; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
 ; GFX11-NEXT:    s_subb_u32 s17, s3, s11
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s0, s[18:19], s[0:1]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[16:17], s[2:3]
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s1, s[16:17], s[2:3]
+; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
 ; GFX11-NEXT:    s_cselect_b32 s20, 1, 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
-; GFX11-NEXT:    v_cmp_gt_u64_e64 s2, s[8:9], 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-NEXT:    s_and_b32 s0, 1, s20
 ; GFX11-NEXT:    s_cmp_eq_u64 s[10:11], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s2
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s2, s[10:11], 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
 ; GFX11-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX11-NEXT:    s_ashr_i32 s8, s17, 31
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX11-NEXT:    s_and_b32 s1, 1, s1
-; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
+; GFX11-NEXT:    s_ashr_i32 s8, s17, 31
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX11-NEXT:    s_add_i32 s9, s8, 0x80000000
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, s0
 ; GFX11-NEXT:    s_sub_u32 s0, s4, s12
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s1
 ; GFX11-NEXT:    s_subb_u32 s1, s5, s13
 ; GFX11-NEXT:    s_subb_u32 s2, s6, s14
-; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc_lo
 ; GFX11-NEXT:    s_subb_u32 s3, s7, s15
-; GFX11-NEXT:    v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v7, s3
+; GFX11-NEXT:    v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
 ; GFX11-NEXT:    s_cmp_eq_u64 s[2:3], s[6:7]
-; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
-; GFX11-NEXT:    v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
+; GFX11-NEXT:    v_cmp_lt_i64_e64 s5, s[2:3], s[6:7]
 ; GFX11-NEXT:    v_cmp_gt_u64_e64 s6, s[12:13], 0
 ; GFX11-NEXT:    s_cselect_b32 s10, 1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v5, s0
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT:    v_xor_b32_e32 v0, v1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX11-NEXT:    s_and_b32 s4, 1, s10
 ; GFX11-NEXT:    s_cmp_eq_u64 s[14:15], 0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s6
 ; GFX11-NEXT:    v_cmp_gt_i64_e64 s6, s[14:15], 0
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s4
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s5
 ; GFX11-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
 ; GFX11-NEXT:    s_and_b32 s5, 1, s5
-; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s6
-; GFX11-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s18
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v1, s4
+; GFX11-NEXT:    v_dual_mov_b32 v5, s0 :: v_dual_mov_b32 v6, s1
+; GFX11-NEXT:    v_mov_b32_e32 v7, s3
+; GFX11-NEXT:    v_cndmask_b16 v2, v4, v3, s5
+; GFX11-NEXT:    v_mov_b32_e32 v3, s18
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s16
+; GFX11-NEXT:    s_ashr_i32 s4, s3, 31
 ; GFX11-NEXT:    v_xor_b32_e32 v1, v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v4, s19
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s17
@@ -6362,6 +6364,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s8, vcc_lo
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s9, vcc_lo
+; GFX11-NEXT:    s_add_i32 s0, s4, 0x80000000
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index bc359d6ff3aaa0..12e677e5546fd0 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -34508,14 +34508,25 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_select_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_select_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, bfloat %a, bfloat %b
   ret bfloat %op
 }
@@ -34573,21 +34584,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg bfloat %a
   %op = select i1 %cond, bfloat %neg.a, bfloat %b
@@ -34647,21 +34661,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_xor_b16 v2.l, 0x8000, v2.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg bfloat %b
   %op = select i1 %cond, bfloat %a, bfloat %neg.b
@@ -34749,11 +34766,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, v3.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -34761,14 +34782,15 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX11FAKE16-LABEL: v_select_v2bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v0, v3, vcc_lo
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %op
@@ -34856,14 +34878,19 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v5.l, v4.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, v2.l, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -34872,14 +34899,15 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v3, v2, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, v5, v4, s0
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
@@ -34936,16 +34964,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_select_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v1, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_select_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, s1, v0.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_select_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, s1, v0, vcc_lo
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11FAKE16-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
   %op = select i1 %cond, bfloat %a, bfloat %b
   %cast = bitcast bfloat %op to i16
@@ -35038,17 +35077,21 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX11TRUE16-LABEL: s_select_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
+; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -35056,13 +35099,13 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11FAKE16-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
-; GFX11FAKE16-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v0, s0
+; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, s0, v1, vcc_lo
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, s1, v0, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11FAKE16-NEXT:    ; return to shader part epilog
@@ -35156,17 +35199,20 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s4
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s2
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, v1.h, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
@@ -35174,16 +35220,16 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX11FAKE16-LABEL: s_vselect_v2bf16:
 ; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
+; GFX11FAKE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s0
 ; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, s0, v0, s2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, s1, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11FAKE16-NEXT:    ; return to shader part epilog
@@ -36876,61 +36922,63 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX11TRUE16-NEXT:    s_lshr_b32 s5, s1, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s4
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s5
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s4
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX11TRUE16-NEXT:    s_lshr_b32 s7, s3, 16
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
+; GFX11TRUE16-NEXT:    s_lshr_b32 s8, s1, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, s7
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s3
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 0, v2
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 0, v3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, s8
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s7
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.h, s1
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v0.l, v0.h, s6
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.h, v2.l, s4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.h, v3.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, v3.h, s5
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.h
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX11FAKE16-LABEL: s_vselect_v4bf16:
 ; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
-; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX11FAKE16-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
-; GFX11FAKE16-NEXT:    v_mov_b32_e32 v4, s5
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11FAKE16-NEXT:    v_mov_b32_e32 v6, s0
-; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
+; GFX11FAKE16-NEXT:    s_lshr_b32 s7, s1, 16
+; GFX11FAKE16-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s4, 0, v1
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s5, 0, v2
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s6, 0, v3
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s9
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX11FAKE16-NEXT:    s_lshr_b32 s8, s3, 16
+; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, s8, v0, s6
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, s0, v1, s4
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v2, s2, v2, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v3, s3, v3, s5
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX11FAKE16-NEXT:    ; return to shader part epilog
   %cond = icmp eq <4 x i32> %c, zeroinitializer
   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
@@ -37078,53 +37126,60 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v7.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v6.l, v4.l, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v8.l, v3.l, s1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v7.l, v5.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v4bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v7
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v5, v7, v5, s2
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v4, v6, v4, s0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v2, v3, v0, s1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v4, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v2, v5, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
@@ -37368,93 +37423,95 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v15.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v16, v17, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v13.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v17, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v10, v14, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v7
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v6
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v5
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v15
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v11
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v2
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v3
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v5.l, v1.l, s2
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v13
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v10
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v14
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v15.l, v11.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v14.l, v10.l, s4
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.l, v2.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v12.l, v8.l, s0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v5.l, v4.l, s1
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v13.l, v9.l, s5
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v7.l, v6.l, s6
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v3.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v0.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v0.l
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v4, v5, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v2, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v3, v7, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v8, v9, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v8bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v4
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v5
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v7
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v3
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v1
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v2
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v4
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v3, v15, v11, s2
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v11
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v15
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v2, v14, v10, s5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v10
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v9
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v8
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v12
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v13
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v14
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v9, v13, v9, s3
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v8, v12, v8, s0
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v6, v7, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v5, v10, v5, s1
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v4, v11, v4, s4
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v7, v1, v0, s6
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v6, v8, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v5, v9, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v4, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v3, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
@@ -38024,181 +38081,176 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v49.l, v26.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v50.l, v18.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v33.l, v30.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v34.l, v22.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v53.l, v24.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v54.l, v16.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v35.l, v29.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v36.l, v21.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v51.l, v25.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v33, v34, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v52.l, v17.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v37.l, v28.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v38.l, v20.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v36, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v48.l, v19.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v37, v38, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v39, v48, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v49, v50, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v32.l, v23.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v51, v52, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v53, v54, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v30, v22, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v29, v21, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v28, v20, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v27, v19, vcc_lo
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v24, v16, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v25, v17, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v26, v18, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v28.l, v20.l, s8
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v38.l, v37.l, s7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v12
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v10
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v27.l, v19.l, s6
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v48.l, v39.l, s5
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v54.l, v53.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v24.l, v16.l, s0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v12.l, v2.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v2.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v30.l, v22.l, s10
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v34.l, v33.l, s11
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v29.l, v21.l, s12
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v36.l, v35.l, s9
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v52.l, v51.l, s1
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v25.l, v17.l, s2
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v50.l, v49.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v5.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v3.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, v3.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v26.l, v18.l, s4
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, v5.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v6.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, v6.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v1.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v1.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v0.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v0.l
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v7, v8, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v5, v9, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v5, v14, v15, 0x5040100
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.l
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v3, v23, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v31
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v31.l, v23.l, s14
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v2.l, v32.l, s13
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.l
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v6, v4, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v4, v12, v13, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v6, v16, v17, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v10, v11, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v16bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
 ; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v12
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v10
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v1, v25, v17, s2
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v24, v16, s0
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v10, v54, v53, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v11, v52, v51, s1
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v6, v30, v22, s10
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v7, v34, v33, s11
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v5, v29, v21, s12
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v8, v36, v35, s9
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v4, v28, v20, s8
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v9, v38, v37, s7
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v3, v27, v19, s6
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v2, v26, v18, s4
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v12, v50, v49, s3
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v13, v48, v39, s5
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v10, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v12, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v13, v3, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v5, v8, v5, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v6, v7, v6, 0x5040100
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v31
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v10, v31, v23, s14
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v11, v14, v32, s13
+; GFX11FAKE16-NEXT:    v_perm_b32 v7, v11, v10, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
@@ -39660,217 +39712,197 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
 ; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s22, 1, v22
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s24, 1, v24
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s26, 1, v30
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s27, 1, v26
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v12
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v17
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s16, 1, v16
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s17, 1, v19
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s18, 1, v18
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s19, 1, v21
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s20, 1, v20
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s21, 1, v23
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s23, 1, v25
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s25, 1, v27
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s28, 1, v29
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v31
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(31)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v96.l, v32.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v32
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v33.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(29)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v98.l, v34.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.l, v32.l, v33.l, s26
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v33
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v35.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(27)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v100.l, v36.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v0.h, v34.l, v35.l, s29
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v35
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v34
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v101.l, v37.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v102.l, v38.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.l, v36.l, v37.l, s27
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v37
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v12, 16, v36
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v103.l, v39.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v112.l, v48.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v1.h, v38.l, v39.l, s24
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v39
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v38
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v113.l, v49.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v114.l, v50.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v115.l, v51.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v116.l, v52.l
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.l, v48.l, v49.l, s22
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v49
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v48
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v117.l, v53.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v118.l, v54.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v119.l, v55.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v128.l, v64.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v53
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v52
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v129.l, v65.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v130.l, v66.l
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v131.l, v67.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v65
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v64
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v2.h, v50.l, v51.l, s20
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v132.l, v68.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v68
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v133.l, v69.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v69
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v134.l, v70.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v70
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v135.l, v71.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v31, 16, v71
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(7)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v80
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v81
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v146.l, v82.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v82
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v147.l, v83.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v83
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v96, v96, v97, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v84
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v85.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v98, v98, v99, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v144.l, v80.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e64 v145.l, v81.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v26, v100, v101, vcc_lo
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v24, v102, v103, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v22, v112, v113, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v20, v114, v115, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v18, v116, v117, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v16, v118, v119, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v14, v128, v129, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v130, v131, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v132, v133, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v134, v135, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v144, v145, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v146, v147, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v85
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v86
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v87.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v30, v97, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v84.l, v85.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v28, v99, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v85.l, v87.l
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v30, v84, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v28, v85, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v14, v29, v98, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v15, v31, v96, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.h, v86.l, v87.l, s0
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v87
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v8
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v51
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v50
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.l, v52.l, v53.l, s18
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v3.h, v54.l, v55.l, s16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v55
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v54
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.l, v64.l, v65.l, s14
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v4.h, v66.l, v67.l, s12
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v67
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v66
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.h, v70.l, v71.l, s8
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.h, v82.l, v83.l, s4
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.l, v10.l, v9.l, s28
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v8.h, v12.l, v11.l, s25
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.l, v14.l, v13.l, s23
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v9.h, v18.l, v15.l, s21
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.h, v22.l, v21.l, s17
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.h, v26.l, v25.l, s13
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.h, v30.l, v29.l, s9
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.l, v32.l, v31.l, s7
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v13.h, v34.l, v33.l, s5
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.l, v36.l, v35.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v14.h, v38.l, v37.l, s1
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.l, v48.l, v39.l, vcc_lo
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v15.h, v17.l, v16.l, s0
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v5.l, v68.l, v69.l, s10
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v6.l, v80.l, v81.l, s6
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v7.l, v84.l, v85.l, s2
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v10.l, v20.l, v19.l, s19
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v11.l, v24.l, v23.l, s15
+; GFX11TRUE16-NEXT:    v_cndmask_b16 v12.l, v28.l, v27.l, s11
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v18.l, v7.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v19.l, v6.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v20.l, v5.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v21.l, v4.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v22.l, v4.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v23.l, v3.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v24.l, v3.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v25.l, v2.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v26.l, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v27.l, v1.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v1.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v29.l, v0.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v0.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, v15.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v14.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, v14.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v13.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, v13.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v13.l, v12.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v11.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v10.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v9.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v31.l, v9.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v32.l, v8.h
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v33.l, v8.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v15.l, v15.h
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v18, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v2, v19, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v3, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v4, v4, v20, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v6, v12, v21, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v7, v14, v22, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v8, v11, v23, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v9, v16, v24, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v10, v10, v25, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v11, v17, v26, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v12, v31, v27, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v13, v32, v28, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v14, v33, v29, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v15, v15, v30, 0x5040100
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v32bf16:
@@ -39910,167 +39942,168 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX11FAKE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
 ; GFX11FAKE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
 ; GFX11FAKE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(30)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v30, 1, v30
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(28)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v35, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(26)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(24)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v24, v38, v39, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v7, 1, v7
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(22)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v22, v48, v49, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(20)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v20, v50, v51, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v11, 1, v11
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(18)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v18, v52, v53, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
 ; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s14, 1, v14
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s16, 1, v16
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s18, 1, v18
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s20, 1, v20
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s22, 1, v22
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s24, 1, v24
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s26, 1, v30
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s27, 1, v26
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s29, 1, v28
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s1, 1, v3
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s2, 1, v2
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s3, 1, v5
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s4, 1, v4
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s5, 1, v7
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s6, 1, v6
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s7, 1, v9
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s8, 1, v8
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s9, 1, v11
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s10, 1, v10
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s11, 1, v13
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s12, 1, v12
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s13, 1, v15
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s15, 1, v17
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s17, 1, v19
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s19, 1, v21
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s21, 1, v23
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s23, 1, v25
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s25, 1, v27
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s28, 1, v29
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(32)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v31
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v32
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v15, v32, v33, s26
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v33
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v14, v34, v35, s29
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v35
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v34
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v13, v36, v37, s27
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v37
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v36
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v12, v38, v39, s24
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v39
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v38
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v11, v48, v49, s22
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v49
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v48
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v16, v50, v51, s20
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v51
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v50
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v19, v52, v53, s18
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v53
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v52
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(16)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v16, v54, v55, vcc_lo
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v22, v54, v55, s16
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v55
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v54
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v25, v64, v65, s14
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v65
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v64
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v12, v66, v67, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v67
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v66
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v68, v69, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v69
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v68
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v70, v71, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v71
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v70
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v80, v81, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v81
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v80
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v4, v82, v83, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v31, 1, v31
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v83
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v82
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v84, v85, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v85
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v84
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v86, v87, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v84, v85, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v86, v87, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
-; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
-; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
-; GFX11FAKE16-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v54, v86, v87, s0
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v87
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v86
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v28, v66, v67, s12
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v31, v68, v69, s10
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v34, v70, v71, s8
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v37, v80, v81, s6
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v48, v82, v83, s4
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v51, v84, v85, s2
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v65, v4, v3, s28
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v66, v6, v5, s25
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v67, v8, v7, s23
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v68, v10, v9, s21
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v10, v18, v17, s19
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v9, v21, v20, s17
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v8, v24, v23, s15
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v7, v27, v26, s13
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v6, v30, v29, s11
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v5, v33, v32, s9
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v4, v36, v35, s7
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v0, v64, v55, vcc_lo
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v3, v53, v52, s1
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v17, v50, v49, s3
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v18, v39, v38, s5
+; GFX11FAKE16-NEXT:    v_cndmask_b16 v20, v2, v1, s0
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v54, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v51, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v17, v48, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v18, v37, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v4, v4, v34, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v5, v5, v31, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v6, v6, v28, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v7, v7, v25, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v8, v8, v22, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v9, v9, v19, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v10, v10, v16, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v11, v68, v11, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v12, v67, v12, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v13, v66, v13, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v14, v65, v14, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v15, v20, v15, 0x5040100
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 5b72795ba07eaa..6bdab8b28cf59b 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1332,14 +1332,14 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX1100-LABEL: fmul_select_v2f16_test3:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v2, 0x3c00, v2, s0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_pack_b32_f16 v1, v2, v1
 ; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
@@ -1392,14 +1392,14 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX1100-LABEL: fmul_select_v2f16_test4:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3800
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX1100-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT:    v_mov_b32_e32 v2, 0x3800
+; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v2, 0x3c00, v2, s0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_pack_b32_f16 v1, v2, v1
 ; GFX1100-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2
@@ -1484,10 +1484,10 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX1100-LABEL: fmul_select_f16_test6:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0xc800
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4200, v3, vcc_lo
+; GFX1100-NEXT:    v_mov_b32_e32 v1, 0xc800
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x4200, v1, vcc_lo
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
@@ -1530,10 +1530,10 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX1100-LABEL: fmul_select_f16_test7:
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_mov_b32_e32 v3, 0x4800
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v3, vcc_lo
+; GFX1100-NEXT:    v_mov_b32_e32 v1, 0x4800
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0xc400, v1, vcc_lo
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %bool = icmp eq i32 %bool.arg1, %bool.arg2
@@ -1575,7 +1575,7 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
-; GFX1100-NEXT:    v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0, 0x8000, vcc_lo
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX1100-NEXT:    s_setpc_b64 s[30:31]
@@ -1776,7 +1776,7 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3f80, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -1849,7 +1849,7 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3f80, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -1947,24 +1947,23 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b16 v2, 0x3f80, v5, s0
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3f80, v5, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -2059,24 +2058,23 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX1100:       ; %bb.0:
 ; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-NEXT:    v_mov_b32_e32 v5, 0x3f00
+; GFX1100-NEXT:    v_cmp_eq_u32_e64 s0, v2, v4
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_cndmask_b16 v2, 0x3f80, v5, s0
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x3f80, v5, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX1100-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX1100-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX1100-NEXT:    v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1
+; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1100-NEXT:    v_bfe_u32 v3, v0, 16, 1
 ; GFX1100-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX1100-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX1100-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1100-NEXT:    v_or_b32_e32 v5, 0x400000, v0
 ; GFX1100-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
 ; GFX1100-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -2147,7 +2145,7 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x4100, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -2222,7 +2220,7 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x4040, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -2296,7 +2294,7 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0xc080, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -2445,7 +2443,7 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0xc200, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -2520,7 +2518,7 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0xdb80, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
@@ -2595,7 +2593,7 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
 ; GFX1100-NEXT:    v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0
 ; GFX1100-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
+; GFX1100-NEXT:    v_cndmask_b16 v1, 0x4c00, v3, vcc_lo
 ; GFX1100-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-NEXT:    v_mul_f32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index efbbe2b27f10f9..7187801e5990b7 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -450,20 +450,22 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:  .LBB2_4: ; %exit
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x3900
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s1, 0.5, v0
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v1, s0
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x3d00, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x3900, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3900, v1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -1062,20 +1064,22 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:  .LBB5_4: ; %exit
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x3900
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s1, 0.5, v0
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v1, s0
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x3d00, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x3900, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3900, v1, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
@@ -1406,34 +1410,34 @@ define amdgpu_gfx <8 x i16> @vec_16xi16_extract_8xi16_0(i1 inreg %cond, ptr addr
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:  .LBB7_4: ; %exit
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v5
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3d00
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_gt_u16_e64 s0, 0x3801, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cmp_lt_u16_e64 s1, 0x3800, v1
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x3900, v5, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3900
+; GFX11-NEXT:    v_cndmask_b16 v8, 0x3900, v5, s0
 ; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_u16_e32 vcc_lo, 0x3801, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_u16_e32 vcc_lo, 0x3800, v8
-; GFX11-NEXT:    v_perm_b32 v2, v7, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v3, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v6, v5, 0x5040100
+; GFX11-NEXT:    v_cmp_gt_u16_e64 s0, 0x3801, v2
+; GFX11-NEXT:    v_cmp_gt_u16_e64 s2, 0x3801, v0
+; GFX11-NEXT:    v_cmp_gt_u16_e64 s3, 0x3801, v6
+; GFX11-NEXT:    v_cmp_gt_u16_e64 s34, 0x3801, v4
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x3900, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v5, s0
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x3900, v5, s2
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x3900, v5, s3
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x3900, v5, s34
+; GFX11-NEXT:    v_cndmask_b16 v6, 0x3d00, v1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v4, v2, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v5, v8, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v6, v7, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
@@ -1697,34 +1701,34 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:  .LBB8_4: ; %exit
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
-; GFX11-NEXT:    v_mov_b32_e32 v9, 0x3900
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x3d00
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x3900, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v1
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x3900, v5, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x3900
+; GFX11-NEXT:    v_cndmask_b16 v8, 0x3900, v5, s0
 ; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v8
-; GFX11-NEXT:    v_pack_b32_f16 v2, v4, v7
-; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x3d00, v9, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_pack_b32_f16 v3, v5, v6
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v0
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s3, 0.5, v6
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s34, 0.5, v4
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x3900, v5, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v5, s0
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x3900, v5, s2
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x3900, v5, s3
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x3900, v5, s34
+; GFX11-NEXT:    v_cndmask_b16 v6, 0x3d00, v1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v1, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_pack_b32_f16 v2, v8, v5
+; GFX11-NEXT:    v_pack_b32_f16 v3, v7, v6
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 %cond, label %T, label %F
 
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index c3c1540383ec63..ffe7649e4bbb12 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -645,36 +645,36 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 1
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v5, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v5, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v2, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -865,69 +865,69 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
 ; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
 ; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3] offset:16
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 1
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v9, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 4
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v9, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 5
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v2, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 1, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 7
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 9
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v4, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 10
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 11
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v5, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 12
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 13
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v6, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 14
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v7
 ; GFX11-NEXT:    s_cmp_eq_u32 s4, 15
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v7, s2
+; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s2
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index e874ee56f594ca..91f57e644ec723 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -62,7 +62,7 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16:
@@ -151,11 +151,12 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s0, v0, v1
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -262,12 +263,13 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s0, v5, v4
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v4, v5, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -395,22 +397,23 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v7, v6
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-SAFE-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s0, v0, v2
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s1, v5, v4
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s2, v7, v6
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
@@ -615,36 +618,36 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v11, v10
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v13, v12
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v11, v12, v13, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v15, v14
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v9, v8
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v2, v6
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0, v4
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v2, v11, v2, 0x5040100
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v1, v5
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v3, v7
-; GFX11-SAFE-NEXT:    v_perm_b32 v1, v12, v1, 0x5040100
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s0, v11, v10
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s1, v13, v12
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s3, v0, v4
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s2, v15, v14
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s4, v1, v5
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s5, v2, v6
+; GFX11-SAFE-NEXT:    v_cmp_nle_f16_e64 s6, v3, v7
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v12, v12, v13, s1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v14, v14, v15, s2
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v10, v10, v11, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v8, v8, v9, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v4, v0, s3
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v5, v1, s4
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v6, v2, s5
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v3, v7, v3, s6
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_perm_b32 v2, v12, v2, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 0723290bdf734d..b7e9e15a0561f5 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -63,7 +63,7 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16:
@@ -152,11 +152,12 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v1
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -263,12 +264,13 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s0, v5, v4
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v4, v5, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -396,22 +398,23 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-SAFE-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s1, v5, v4
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s2, v7, v6
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
@@ -616,36 +619,36 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v11, v10
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v13, v12
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v11, v12, v13, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v15, v14
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v3
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v9, v8
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v2, v6
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v4
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v2, v11, v2, 0x5040100
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v5
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v7
-; GFX11-SAFE-NEXT:    v_perm_b32 v1, v12, v1, 0x5040100
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s0, v11, v10
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s1, v13, v12
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s3, v0, v4
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s2, v15, v14
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s4, v1, v5
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s5, v2, v6
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s6, v3, v7
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v12, v12, v13, s1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v14, v14, v15, s2
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v10, v10, v11, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v8, v8, v9, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v4, v0, s3
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, v5, v1, s4
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, v6, v2, s5
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v3, v7, v3, s6
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SAFE-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SAFE-NEXT:    v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-NEXT:    v_perm_b32 v2, v12, v2, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 64be9cb72a6ee3..6c1a7ac56a8672 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -425,15 +425,16 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; GFX11-DENORM-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, s0, -1.0
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v1, s1, -1.0
+; GFX11-DENORM-NEXT:    v_add_f16_e64 v1, s0, -1.0
+; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, s1, -1.0
 ; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-DENORM-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
-; GFX11-DENORM-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
 ; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT:    v_cmp_gt_f16_e64 s2, |v0|, |v1|
+; GFX11-DENORM-NEXT:    v_cndmask_b16 v0, v1, v0, s2
+; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-DENORM-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
 ; GFX11-DENORM-NEXT:    v_mul_f16_e32 v1, v0, v0
+; GFX11-DENORM-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DENORM-NEXT:    v_fma_f16 v0, -v1, v0, 1.0
 ; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-NEXT:    global_store_b16 v2, v0, s[0:1]
@@ -444,18 +445,18 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; GFX11-FLUSH-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, s0, -1.0
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v1, s1, -1.0
+; GFX11-FLUSH-NEXT:    v_add_f16_e64 v1, s0, -1.0
+; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, s1, -1.0
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_cmp_gt_f16_e64 s0, |v0|, |v1|
+; GFX11-FLUSH-NEXT:    v_cndmask_b16 v0, v1, v0, s0
 ; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FLUSH-NEXT:    v_cmp_gt_f16_e64 vcc_lo, |v1|, |v0|
-; GFX11-FLUSH-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
 ; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e64 v0, |v0|, |v0|
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v0, v0
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v0, 1.0, v0
 ; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-FLUSH-NEXT:    global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 104e157e9e15ae..352847bb57c207 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -4842,9 +4842,9 @@ define half @v_mul_f16_select_64_1(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -4900,9 +4900,9 @@ define half @v_mul_f16_select_1_64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -4958,9 +4958,9 @@ define half @v_mul_f16_select_n1_n64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5016,9 +5016,9 @@ define half @v_mul_f16_select_128_64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5074,9 +5074,9 @@ define half @v_mul_f16_select_n128_n64(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5132,9 +5132,9 @@ define half @v_mul_f16_select_n128_n16(i32 %arg, half %x) {
 ; GFX11-GISEL-LABEL: v_mul_f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xcc00
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_mul_f16_e32 v0, v1, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5185,18 +5185,18 @@ define half @v_contract_mul_add_f16_select_64_1(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x3c00, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5248,18 +5248,18 @@ define half @v_contract_mul_add_f16_select_1_64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x3c00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5311,18 +5311,18 @@ define half @v_contract_mul_add_f16_select_n64_n1(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5374,18 +5374,18 @@ define half @v_contract_mul_add_f16_select_n1_n64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0xbc00
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xd400, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xbc00, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5437,18 +5437,18 @@ define half @v_contract_mul_add_f16_select_128_64(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x5800
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5500,18 +5500,18 @@ define half @v_contract_mul_add_f16_select_128_4(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x5800
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x4400, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5563,18 +5563,18 @@ define half @v_contract_mul_add_f16_select_2_4(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x4000
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x4400, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4000, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x4000, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5626,18 +5626,18 @@ define half @v_contract_mul_add_f16_select_4_128(i32 %arg, half %x, half %y) {
 ; GFX11-SDAG-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-SDAG:       ; %bb.0:
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5800, v0, vcc_lo
 ; GFX11-SDAG-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5800
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x4400, vcc_lo
 ; GFX11-GISEL-NEXT:    v_fma_f16 v0, v1, v0, v2
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq i32 %arg, 0
@@ -5705,23 +5705,23 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x3c00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x3c00, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5400, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -5788,23 +5788,23 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x5400, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -5871,23 +5871,23 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0xd400, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xbc00, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -5954,23 +5954,23 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x5400, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6037,23 +6037,23 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xd400, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0xd400, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd800, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6120,23 +6120,23 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xcc00, v3, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0xcc00, v3, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v3, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v3, 0xd800, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xcc00
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0xd800, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd800, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6203,23 +6203,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x3c00, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x3c00, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5400, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3c00
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x5400, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5400, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6287,23 +6287,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x5400, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x3c00, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x3c00, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6371,23 +6371,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xbc00, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0xbc00, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xd400, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xd400, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xbc00
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0xd400, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xd400, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6455,23 +6455,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0xd400, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0xd400, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0xd400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0xbc00, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0xd400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0xbc00, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0xbc00, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6539,23 +6539,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5400, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x5400, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6623,23 +6623,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x4400, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x4400, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x5800, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x5800, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x5800, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6707,23 +6707,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x4400, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x4400, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4000, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4000, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x4000, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x4000, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
@@ -6791,23 +6791,23 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-SDAG-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v0, 0x5800, v4, vcc_lo
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v1, 0x5800, v4, s0
+; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX11-GISEL:       ; %bb.0:
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, v4, 0x4400, vcc_lo
-; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, 0x4400, vcc_lo
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0x5800
+; GFX11-GISEL-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v1, v0, 0x4400, vcc_lo
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v0, v0, 0x4400, s0
+; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cond = icmp eq <2 x i32> %arg, zeroinitializer
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad0..589804177a747c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -558,12 +558,12 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 ; GFX11-SAFE-NEXT:    v_mul_f16_e32 v0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, s0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, v0, s0, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x7e00, 0, vcc_lo
 ; GFX11-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
@@ -573,10 +573,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, v0, s0, s1
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x7e00, 0, vcc_lo
 ; GFX11-NSZ-NEXT:    ; return to shader part epilog
 .entry:
   %tmp7 = fdiv half 1.000000e+00, %tmp6
@@ -646,13 +646,12 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 ;
 ; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
 ; GFX11-SAFE:       ; %bb.0: ; %.entry
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 s1, s0, 0
 ; GFX11-SAFE-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e64 vcc_lo, s0, 0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x8000, v0, s1
 ; GFX11-SAFE-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x7e00, 0, vcc_lo
 ; GFX11-SAFE-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
@@ -662,10 +661,10 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
 ; GFX11-NSZ-NEXT:    v_mul_f16_e32 v0, 0x8000, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, v0, s0, s1
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x7e00, 0, vcc_lo
 ; GFX11-NSZ-NEXT:    ; return to shader part epilog
 .entry:
   %tmp7 = fdiv afn half 1.000000e+00, %tmp6
@@ -3835,7 +3834,7 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; GFX11-SAFE-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-SAFE-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v0, v1, v0
@@ -3850,7 +3849,7 @@ define half @v_fneg_round_f16(half %a) #0 {
 ; GFX11-NSZ-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; GFX11-NSZ-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-NSZ-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
 ; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NSZ-NEXT:    v_sub_f16_e64 v0, -v1, v0
@@ -4677,7 +4676,7 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index b2d30b751ae2c4..0077951c4967ea 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -164,7 +164,7 @@ define i16 @fneg_xor_select_i16(i1 %cond, i16 %arg0, i16 %arg1) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %select = select i1 %cond, i16 %arg0, i16 %arg1
@@ -210,14 +210,15 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v1
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v1, v5, v4, s0
 ; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
@@ -256,7 +257,7 @@ define i16 @fneg_xor_select_i16_multi_use(i1 %cond, i16 %arg0, i16 %arg1, ptr ad
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v1
 ; GFX11-NEXT:    global_store_b16 v[3:4], v1, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -582,16 +583,16 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg half %arg0
   %select0 = select i1 %cond0, half %arg1, half %fneg0
@@ -618,16 +619,16 @@ define i16 @select_fneg_xor_select_i16(i1 %cond0, i1 %cond1, i16 %arg0, i16 %arg
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff8000, v2
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0xffff8000, v0
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0xffff8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor i16 %arg0, -32768
   %select0 = select i1 %cond0, i16 %arg1, i16 %fneg0
@@ -702,29 +703,30 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX11-LABEL: select_fneg_select_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v4, v5, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v2
+; GFX11-NEXT:    v_perm_b32 v4, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, v4, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = fneg <2 x half> %arg0
   %select0 = select <2 x i1> %cond0, <2 x half> %arg1, <2 x half> %fneg0
@@ -787,29 +789,30 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX11-LABEL: select_fneg_xor_select_v2i16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
 ; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v4, v5, s0
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_perm_b32 v4, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 1, v2
+; GFX11-NEXT:    v_perm_b32 v4, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v5 :: v_dual_and_b32 v2, 1, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, v4, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg0 = xor <2 x i16> %arg0, <i16 -32768, i16 -32768>
   %select0 = select <2 x i1> %cond0, <2 x i16> %arg1, <2 x i16> %fneg0
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f6ee007facd7fd..4dae542da48e79 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1996,10 +1996,10 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_fract_f16_e32 v3, v0
-; GFX11-NEXT:    v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0|
+; GFX11-NEXT:    v_cmp_neq_f16_e64 s0, 0x7c00, |v0|
 ; GFX11-NEXT:    v_floor_f16_e32 v4, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0, v3, s0
 ; GFX11-NEXT:    global_store_b16 v[1:2], v4, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2154,19 +2154,19 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_fract_f16_e32 v6, v0
-; GFX11-NEXT:    v_floor_f16_e32 v5, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fract_f16_e32 v4, v3
+; GFX11-NEXT:    v_fract_f16_e32 v4, v0
+; GFX11-NEXT:    v_cmp_class_f16_e64 s1, v0, 0x204
+; GFX11-NEXT:    v_floor_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_fract_f16_e32 v5, v3
 ; GFX11-NEXT:    v_cmp_class_f16_e64 s0, v3, 0x204
-; GFX11-NEXT:    v_floor_f16_e32 v7, v3
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v4, 0, s0
-; GFX11-NEXT:    v_cmp_class_f16_e64 s0, v0, 0x204
+; GFX11-NEXT:    v_floor_f16_e32 v3, v3
+; GFX11-NEXT:    v_cndmask_b16 v4, v4, 0, s1
+; GFX11-NEXT:    v_cndmask_b16 v5, v5, 0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pack_b32_f16 v4, v5, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v6, 0, s0
-; GFX11-NEXT:    global_store_b32 v[1:2], v4, off
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v3, v0, v3
+; GFX11-NEXT:    v_pack_b32_f16 v0, v4, v5
+; GFX11-NEXT:    global_store_b32 v[1:2], v3, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %floor = tail call <2 x half> @llvm.floor.v2f16(<2 x half> %x)
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index d09af8fd2ac954..da891a709ac1c8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2513,36 +2513,37 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 6
 ; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v3, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
+; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
-; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s3
-; GFX11-NEXT:    v_cndmask_b32_e64 v8, v8, s4, s2
+; GFX11-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cndmask_b16 v5, v3, s4, s2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cndmask_b16 v6, v2, s4, s6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v7, v1, s4, s8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, s4, s10
+; GFX11-NEXT:    v_cndmask_b16 v3, v3, s4, s3
+; GFX11-NEXT:    v_cndmask_b16 v2, v2, s4, s7
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, s4, s9
+; GFX11-NEXT:    v_cndmask_b16 v8, v8, s4, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_perm_b32 v3, v3, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
 ; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
@@ -3082,69 +3083,69 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 6
 ; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v3, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s5, 4
+; GFX11-NEXT:    s_cselect_b32 s6, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
-; GFX11-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v11, v11, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s10, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s11, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 14
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v7
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, v10, s4, s3
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e64 v9, v12, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s12, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 15
-; GFX11-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX11-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e64 v10, v13, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s14, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 13
-; GFX11-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s15, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e64 v12, v14, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s16, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 11
-; GFX11-NEXT:    v_cndmask_b32_e64 v5, v5, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s17, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e64 v13, v15, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s18, -1, 0
 ; GFX11-NEXT:    s_cmp_eq_u32 s5, 9
-; GFX11-NEXT:    v_cndmask_b32_e64 v4, v4, s4, s2
-; GFX11-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX11-NEXT:    v_perm_b32 v7, v10, v7, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e64 v14, v16, s4, s2
-; GFX11-NEXT:    v_perm_b32 v6, v12, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v14, v4, 0x5040100
+; GFX11-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cndmask_b16 v9, v3, s4, s2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cndmask_b16 v13, v7, s4, s12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-NEXT:    v_cndmask_b16 v14, v6, s4, s14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_cndmask_b16 v15, v5, s4, s16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT:    v_cndmask_b16 v16, v4, s4, s18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cndmask_b16 v10, v2, s4, s6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v11, v1, s4, s8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_cndmask_b16 v12, v0, s4, s10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_cndmask_b16 v7, v7, s4, s13
+; GFX11-NEXT:    v_cndmask_b16 v6, v6, s4, s15
+; GFX11-NEXT:    v_cndmask_b16 v5, v5, s4, s17
+; GFX11-NEXT:    v_cndmask_b16 v4, v4, s4, s5
+; GFX11-NEXT:    v_cndmask_b16 v3, v3, s4, s3
+; GFX11-NEXT:    v_cndmask_b16 v2, v2, s4, s7
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, s4, s9
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, s4, s11
+; GFX11-NEXT:    v_perm_b32 v7, v7, v13, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v14, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v5, v15, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v4, v16, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v2, v10, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v1, v11, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v12, 0x5040100
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
 ; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 1d0367db701436..60c3f8f60bccc4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -60,10 +60,10 @@ define half @v_maximum_f16(half %src0, half %src1) {
 ; GFX11-LABEL: v_maximum_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_f16:
@@ -180,10 +180,10 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
 ; GFX11-LABEL: v_maximum_f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_f16__nsz:
@@ -306,10 +306,10 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_f16__nnan_src0:
@@ -387,10 +387,10 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_max_f16_e32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_f16__nnan_src1:
@@ -485,10 +485,10 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
 ; GFX11-LABEL: s_maximum_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, s0, s1
 ; GFX11-NEXT:    v_max_f16_e64 v0, s0, s1
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v0
@@ -590,17 +590,17 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v4, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v2f16:
@@ -749,17 +749,17 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v2f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v4, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v2f16__nsz:
@@ -939,17 +939,18 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX11-LABEL: s_maximum_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, s0, s1
 ; GFX11-NEXT:    v_pk_max_f16 v0, s0, s1
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, s0, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v0
@@ -1063,21 +1064,20 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v3f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v4, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v3
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v3f16:
@@ -1255,21 +1255,20 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v3f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v4, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v6, v0, v2
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v3
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v3f16__nsz:
@@ -1469,26 +1468,26 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v4f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_pk_max_f16 v7, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v8, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v0, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v7, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v4f16:
@@ -1695,26 +1694,26 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v4f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v4, v1, v3
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_pk_max_f16 v7, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v8, v0, v2
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v0, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v7, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v4f16__nsz:
@@ -1999,44 +1998,44 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v8f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v8, v3, v7
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT:    v_pk_max_f16 v10, v2, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_pk_max_f16 v14, v1, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-NEXT:    v_pk_max_f16 v8, v3, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v12, v11
-; GFX11-NEXT:    v_pk_max_f16 v11, v0, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v13, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v2, v6
+; GFX11-NEXT:    v_pk_max_f16 v9, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v10, 0x7e00, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x7e00, v9, s0
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v2, v6
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v12, v0, v4
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v6, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s3, v0, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s4, v11, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x7e00, v12, s3
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s4
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s2
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x7e00, v4, s0
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v4, v7, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v8f16:
@@ -2402,86 +2401,78 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 ; GFX11-LABEL: v_maximum_v16f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_max_f16 v16, v7, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_pk_max_f16 v18, v7, v15
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v6, v14
 ; GFX11-NEXT:    v_pk_max_f16 v15, v6, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT:    v_pk_max_f16 v20, v4, v12
-; GFX11-NEXT:    v_pk_max_f16 v22, v2, v10
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v17, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; GFX11-NEXT:    v_cndmask_b16 v16, 0x7e00, v18, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    v_pk_max_f16 v20, v5, v13
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x7e00, v7, s0
+; GFX11-NEXT:    v_cndmask_b16 v17, 0x7e00, v15, s1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
-; GFX11-NEXT:    v_pk_max_f16 v14, v5, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v6, v15, v6, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_pk_max_f16 v17, v3, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v15
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v13
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v19, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v4, v12
+; GFX11-NEXT:    v_pk_max_f16 v13, v4, v12
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
-; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v20, v19
-; GFX11-NEXT:    v_pk_max_f16 v19, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b16 v6, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v14, 0x7e00, v20, s0
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x7e00, v5, s1
+; GFX11-NEXT:    v_cndmask_b16 v15, 0x7e00, v13, s2
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v13
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v11
+; GFX11-NEXT:    v_pk_max_f16 v12, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v2, v10
+; GFX11-NEXT:    v_pk_max_f16 v13, v2, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_pk_max_f16 v22, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_perm_b32 v1, v1, v21, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v24, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v18, 0x7e00, v12, s0
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
+; GFX11-NEXT:    v_cndmask_b16 v11, 0x7e00, v13, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v2, v10
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_pk_max_f16 v20, v0, v8
+; GFX11-NEXT:    v_pk_max_f16 v1, v1, v9
+; GFX11-NEXT:    v_cmp_o_f16_e64 s3, v10, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s4, v0, v8
+; GFX11-NEXT:    v_cmp_o_f16_e64 s5, v19, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s2
+; GFX11-NEXT:    v_cndmask_b16 v9, 0x7e00, v20, s4
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s5
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s3
+; GFX11-NEXT:    v_cndmask_b16 v8, 0x7e00, v8, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v3, s0
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v9, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v8, v11, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximum_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index f8c2c54af27830..99d0916ae7a285 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -48,10 +48,10 @@ define half @v_minimum_f16(half %src0, half %src1) {
 ; GFX11-LABEL: v_minimum_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_f16:
@@ -145,10 +145,10 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) {
 ; GFX11-LABEL: v_minimum_f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_f16__nsz:
@@ -247,10 +247,10 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_f16__nnan_src0:
@@ -314,10 +314,10 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_add_f16_e32 v1, 1.0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_min_f16_e32 v2, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
+; GFX11-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_f16__nnan_src1:
@@ -395,10 +395,10 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) {
 ; GFX11-LABEL: s_minimum_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, s0, s1
 ; GFX11-NEXT:    v_min_f16_e64 v0, s0, s1
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v0
@@ -480,17 +480,17 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v4, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v2f16:
@@ -604,17 +604,17 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v2f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v4, v0, v1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v2f16__nsz:
@@ -752,17 +752,18 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX11-LABEL: s_minimum_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, s0, s1
 ; GFX11-NEXT:    v_pk_min_f16 v0, s0, s1
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s1
-; GFX11-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, s0, s1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e64 vcc_lo, s0, s2
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX11-NEXT:    ;;#ASMSTART
 ; GFX11-NEXT:    ; use v0
@@ -849,21 +850,20 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v3f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v4, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v3
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v3f16:
@@ -994,21 +994,20 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v3f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v4, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v6, v0, v2
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v3
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v3f16__nsz:
@@ -1154,26 +1153,26 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v4f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_pk_min_f16 v7, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v8, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v0, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v7, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v4f16:
@@ -1321,26 +1320,26 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v4f16__nsz:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v4, v1, v3
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-NEXT:    v_pk_min_f16 v7, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v8, v0, v2
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v0, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v7, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v8, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s2
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v4f16__nsz:
@@ -1538,44 +1537,44 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v8f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v8, v3, v7
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT:    v_pk_min_f16 v10, v2, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX11-NEXT:    v_pk_min_f16 v14, v1, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v10
+; GFX11-NEXT:    v_pk_min_f16 v8, v3, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v12, v11
-; GFX11-NEXT:    v_pk_min_f16 v11, v0, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
-; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v13, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v2, v6
+; GFX11-NEXT:    v_pk_min_f16 v9, v2, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v10, 0x7e00, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX11-NEXT:    v_perm_b32 v1, v1, v10, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v8
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x7e00, v9, s0
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v2, v6
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v1, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v12, v0, v4
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v5
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v6, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s3, v0, v4
+; GFX11-NEXT:    v_cmp_o_f16_e64 s4, v11, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v9
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s1
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x7e00, v12, s3
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s4
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s2
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x7e00, v4, s0
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v4, v7, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v8f16:
@@ -1821,86 +1820,78 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) {
 ; GFX11-LABEL: v_minimum_v16f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_pk_min_f16 v16, v7, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
+; GFX11-NEXT:    v_pk_min_f16 v18, v7, v15
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v7, v15
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v6, v14
 ; GFX11-NEXT:    v_pk_min_f16 v15, v6, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v16
-; GFX11-NEXT:    v_pk_min_f16 v20, v4, v12
-; GFX11-NEXT:    v_pk_min_f16 v22, v2, v10
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v17, v16
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v18
+; GFX11-NEXT:    v_cndmask_b16 v16, 0x7e00, v18, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
+; GFX11-NEXT:    v_pk_min_f16 v20, v5, v13
+; GFX11-NEXT:    v_cndmask_b16 v7, 0x7e00, v7, s0
+; GFX11-NEXT:    v_cndmask_b16 v17, 0x7e00, v15, s1
 ; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v6, v14
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
-; GFX11-NEXT:    v_pk_min_f16 v14, v5, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v7, v16, v7, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v5, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v6, v15, v6, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v18, v17
-; GFX11-NEXT:    v_pk_min_f16 v17, v3, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v20
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v17
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v15
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v5, v13
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v19, v18
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v4, v12
+; GFX11-NEXT:    v_pk_min_f16 v13, v4, v12
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v11
-; GFX11-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v20, v19
-; GFX11-NEXT:    v_pk_min_f16 v19, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
+; GFX11-NEXT:    v_cndmask_b16 v6, 0x7e00, v6, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v14, 0x7e00, v20, s0
+; GFX11-NEXT:    v_cndmask_b16 v5, 0x7e00, v5, s1
+; GFX11-NEXT:    v_cndmask_b16 v15, 0x7e00, v13, s2
+; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v13
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v11
+; GFX11-NEXT:    v_pk_min_f16 v12, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v2, v10
+; GFX11-NEXT:    v_pk_min_f16 v13, v2, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v10
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_pk_min_f16 v22, v0, v8
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v22
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v8
-; GFX11-NEXT:    v_perm_b32 v1, v1, v21, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v24, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v10
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_o_f16_e32 vcc_lo, v4, v12
-; GFX11-NEXT:    v_perm_b32 v2, v2, v17, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v4, v4, v14, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v18, 0x7e00, v12, s0
+; GFX11-NEXT:    v_cmp_o_f16_e64 s0, v3, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v12
+; GFX11-NEXT:    v_cndmask_b16 v11, 0x7e00, v13, s1
+; GFX11-NEXT:    v_cmp_o_f16_e64 s1, v2, v10
+; GFX11-NEXT:    v_cmp_o_f16_e64 s2, v1, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v8
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v0
+; GFX11-NEXT:    v_pk_min_f16 v20, v0, v8
+; GFX11-NEXT:    v_pk_min_f16 v1, v1, v9
+; GFX11-NEXT:    v_cmp_o_f16_e64 s3, v10, v2
+; GFX11-NEXT:    v_cmp_o_f16_e64 s4, v0, v8
+; GFX11-NEXT:    v_cmp_o_f16_e64 s5, v19, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v20
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v13
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x7e00, v1, s2
+; GFX11-NEXT:    v_cndmask_b16 v9, 0x7e00, v20, s4
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x7e00, v0, s5
+; GFX11-NEXT:    v_cndmask_b16 v2, 0x7e00, v2, s3
+; GFX11-NEXT:    v_cndmask_b16 v8, 0x7e00, v8, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, 0x7e00, v3, s0
+; GFX11-NEXT:    v_cndmask_b16 v4, 0x7e00, v4, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v9, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v8, v11, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v3, v18, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v4, v15, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v5, v14, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v6, v17, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v7, v16, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimum_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c0a85bba93b738..ed029a3c6a2597 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -838,7 +838,7 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
 ; GFX11-NEXT:    v_sub_f16_e32 v1, s2, v0
 ; GFX11-NEXT:    v_cmp_ge_f16_e64 s3, |v1|, 0.5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 0x3c00, s3
+; GFX11-NEXT:    v_cndmask_b16 v1, 0, 0x3c00, s3
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-NEXT:    v_bfi_b32 v1, 0x7fff, v1, s2
 ; GFX11-NEXT:    s_mov_b32 s2, -1
@@ -980,20 +980,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v3, s2, v1
 ; GFX11-NEXT:    v_sub_f16_e32 v2, s3, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_ge_f16_e64 s4, |v3|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s7, |v3|, 0.5
+; GFX11-NEXT:    v_cmp_ge_f16_e64 s6, |v2|, 0.5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v3, 0, 0x3c00, s7
+; GFX11-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, v3, s2
 ; GFX11-NEXT:    v_bfi_b32 v2, 0x7fff, v2, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, 0, 0x3c00, s4
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v3, 0x7fff, v3, s2
 ; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 072ee70b840d83..f1678bb8ee4d40 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -824,7 +824,7 @@ define half @intrinsic_fround_half(half %arg) {
 ; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
@@ -839,7 +839,7 @@ define half @intrinsic_fround_half(half %arg) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
@@ -915,7 +915,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) {
 ; GFX11-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
 ; GFX11-SDAG-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SDAG-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-SDAG-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
@@ -933,7 +933,7 @@ define i32 @intrinsic_lround_i32_f16(half %arg) {
 ; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
 ; GFX11-GISEL-NEXT:    v_cmp_ge_f16_e64 s0, |v2|, 0.5
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-GISEL-NEXT:    v_cndmask_b16 v2, 0, 0x3c00, s0
 ; GFX11-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 5e46fd6b28d275..095b1c0025c011 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -236,18 +236,19 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, v0, s0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -261,13 +262,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximumnum_bf16:
@@ -278,18 +278,19 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b16 v1, v1, v0, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_u16_e64 s0, 0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
@@ -303,13 +304,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -369,17 +369,18 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_maximumnum_bf16_nnan:
@@ -391,17 +392,18 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u16_e64 s0, 0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v0
-; GFX12-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 9e0b7daf38de16..934388fe8edfb1 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -238,18 +238,19 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, v1, v0, s0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_max_f32_e32 v2, v2, v2
@@ -263,13 +264,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimumnum_bf16:
@@ -280,18 +280,19 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b16 v1, v1, v0, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v2, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-NEXT:    v_max_num_f32_e32 v2, v2, v2
@@ -305,13 +306,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b16 v0, v3, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
@@ -373,17 +373,18 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-LABEL: v_minimumnum_bf16_nnan:
@@ -395,17 +396,18 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cmp_eq_u16_e64 s0, 0x8000, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_lt_f32_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v1, v0, vcc_lo
-; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0
-; GFX12-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2
+; GFX12-NEXT:    v_cndmask_b16 v2, v1, v0, vcc_lo
 ; GFX12-NEXT:    v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX12-NEXT:    v_cmp_eq_f32_e32 vcc_lo, 0, v3
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y)
   ret bfloat %result
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 7c1da18de70f83..e0ea9116e214f3 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -34,7 +34,7 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -77,7 +77,7 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v1, |v1|, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v4
@@ -123,7 +123,7 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
@@ -169,7 +169,7 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v1, |v2|, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v3
@@ -212,10 +212,10 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_fabs_var_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -251,10 +251,10 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
 ; GFX11-LABEL: add_select_fabs_negk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -289,10 +289,10 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-LABEL: add_select_fabs_negk_negk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0xc000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -326,10 +326,10 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
 ; GFX11-LABEL: add_select_posk_posk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x4000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -367,7 +367,7 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -407,7 +407,7 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xe400, v1, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -443,7 +443,7 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -480,7 +480,7 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_f16_e64 v0, |v0|, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -518,7 +518,7 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -561,7 +561,7 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v1, v4, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
@@ -607,7 +607,7 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
@@ -653,7 +653,7 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v1, v4, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
@@ -696,10 +696,10 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_fneg_var_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -735,7 +735,7 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -773,7 +773,7 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xb118, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -811,7 +811,7 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3118, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -846,10 +846,10 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-LABEL: add_select_negk_negk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0xc000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -884,10 +884,10 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
 ; GFX11-LABEL: add_select_negliteralk_negliteralk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xe800
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0xe800
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xec00, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -920,10 +920,10 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
 ; GFX11-LABEL: add_select_fneg_negk_negk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0xc000
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -959,7 +959,7 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -996,7 +996,7 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1033,7 +1033,7 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -1072,11 +1072,11 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_negfabs_fabs_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1116,11 +1116,11 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_fabs_negfabs_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX11-NEXT:    v_or_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1160,11 +1160,11 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_neg_fabs_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1203,11 +1203,11 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_fabs_neg_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v1
+; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_add_f16_e32 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1245,10 +1245,10 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_neg_negfabs_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1287,10 +1287,10 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
 ; GFX11-LABEL: add_select_negfabs_neg_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1328,10 +1328,10 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
 ; GFX11-LABEL: mul_select_negfabs_posk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4400, v0, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1371,7 +1371,7 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1408,10 +1408,10 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
 ; GFX11-LABEL: mul_select_negfabs_negk_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xc400, v0, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1451,7 +1451,7 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
 ; GFX11-NEXT:    v_or_b32_e32 v1, 0x8000, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xc400, v1, vcc_lo
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
@@ -1493,8 +1493,8 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16:
@@ -1519,10 +1519,10 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16:
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT:    v_sub_f16_e32 v1, -4.0, v1
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, -4.0, v1
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %add = fadd half %x, 4.0
@@ -1558,8 +1558,8 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
 ; GFX11-SAFE-NEXT:    v_add_f16_e32 v1, -4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
@@ -1584,10 +1584,10 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16:
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT:    v_sub_f16_e32 v1, 4.0, v1
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    v_sub_f16_e32 v0, 4.0, v1
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %add = fsub half %x, 4.0
@@ -1619,10 +1619,10 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) {
 ; GFX11-LABEL: select_fneg_posk_src_mul_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v1, -4.0, v1
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NEXT:    v_mul_f16_e32 v0, -4.0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %mul = fmul half %x, 4.0
@@ -1660,8 +1660,8 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
@@ -1688,10 +1688,10 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16:
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    v_fma_f16 v0, v1, -4.0, -v2
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z)
@@ -1730,8 +1730,8 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
 ; GFX11-SAFE-NEXT:    v_fmac_f16_e32 v2, 4.0, v1
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v1, 0x8000, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
@@ -1759,10 +1759,10 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
 ; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT:    v_fma_f16 v1, v1, -4.0, -v2
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT:    v_fma_f16 v0, v1, -4.0, -v2
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %c, 0
   %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index d2bb971b680307..d5b5b052e7ccb5 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -68,18 +68,18 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -161,17 +161,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v3, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_pk_add_f16 v1, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v5
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -246,19 +246,19 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v3, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_mov_b32_e32 v1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -342,17 +342,17 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX11-NEXT:    v_pk_add_f16 v1, v3, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -424,15 +424,16 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -496,14 +497,15 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -567,13 +569,14 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v3, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -631,12 +634,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3c00, v3, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -700,13 +703,13 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -772,13 +775,13 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xe400, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xe400, v0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -842,14 +845,15 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v0, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -914,13 +918,13 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -985,15 +989,15 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1069,16 +1073,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v6, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1153,16 +1157,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v5, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1240,16 +1244,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v6, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1325,15 +1329,16 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v5, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1394,14 +1399,14 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_fneg_negk_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1463,14 +1468,14 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-LABEL: add_select_fneg_inv2pi_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xb118, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xb118, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1532,14 +1537,14 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3118, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3118, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1597,12 +1602,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v3, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1661,12 +1666,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xe800
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xec00, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xec00, v3, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1723,12 +1728,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0xc000
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v3, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v3, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1789,14 +1794,14 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_negk_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3c00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1857,14 +1862,14 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_fneg_posk_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -1925,14 +1930,14 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX11-LABEL: add_select_posk_fneg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xbc00, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2007,16 +2012,16 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2091,18 +2096,18 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-LABEL: add_select_fabs_negfabs_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2179,16 +2184,16 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2262,18 +2267,18 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX11-LABEL: add_select_fabs_neg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v2, v3, v2, s0
+; GFX11-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2342,15 +2347,16 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11-NEXT:    v_cndmask_b16 v0, v5, v1, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v2, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2419,16 +2425,17 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-LABEL: add_select_negfabs_neg_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v0, v5, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v3, s0
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2498,14 +2505,15 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x4400, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4400, v0, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2575,13 +2583,13 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x4400, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4400, v0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2650,14 +2658,15 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xc400, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xc400, v0, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2727,13 +2736,13 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0xc400, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0xc400, v0, s0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
@@ -2809,15 +2818,16 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0]
-; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
@@ -2868,14 +2878,14 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-NSZ-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %add = fadd <2 x half> %x, <half 4.0, half 4.0>
@@ -2945,15 +2955,16 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
-; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
@@ -3004,14 +3015,14 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-NSZ-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %add = fsub <2 x half> %x, <half 4.0, half 4.0>
@@ -3069,14 +3080,14 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %mul = fmul <2 x half> %x, <half 4.0, half 4.0>
@@ -3152,15 +3163,16 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
-; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
@@ -3197,14 +3209,14 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-NSZ-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
@@ -3282,15 +3294,16 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-SAFE:       ; %bb.0:
 ; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
-; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11-SAFE-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-SAFE-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-SAFE-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
@@ -3348,14 +3361,14 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX11-NSZ:       ; %bb.0:
 ; GFX11-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
 ; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NSZ-NEXT:    v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
-; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NSZ-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v1, 0x4000, v2, s0
+; GFX11-NSZ-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-NEXT:    v_cndmask_b16 v0, 0x4000, v0, vcc_lo
+; GFX11-NSZ-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX11-NSZ-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <2 x i32> %c, zeroinitializer
   %fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> <half 4.0, half 4.0>, <2 x half> %z)
diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
index 50a3336a7483c7..5111870da5a639 100644
--- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -534,7 +534,7 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select i1 %cmp, half %a, half %b
@@ -567,7 +567,7 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select nnan i1 %cmp, half %a, half %b
@@ -600,7 +600,7 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select nsz i1 %cmp, half %a, half %b
@@ -664,7 +664,7 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select i1 %cmp, half %a, half %b
@@ -697,7 +697,7 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select nnan i1 %cmp, half %a, half %b
@@ -730,7 +730,7 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, vcc_lo
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge half %a, %b
   %val = select nsz i1 %cmp, half %a, half %b
@@ -806,11 +806,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> %
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
@@ -856,11 +857,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
@@ -906,11 +908,12 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
@@ -992,11 +995,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> %
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
@@ -1042,11 +1046,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
@@ -1092,11 +1097,12 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v3, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v1, v0, s0
+; GFX12-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <2 x half> %a, %b
@@ -1193,22 +1199,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_safe(<4 x half> %a, <4 x half> %
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1268,22 +1275,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nnan_flag(<4 x half> %a, <4 x ha
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1343,22 +1351,23 @@ define <4 x half> @v_test_fmin_legacy_ule_v4f16_nsz_flag(<4 x half> %a, <4 x hal
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_ngt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1466,22 +1475,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_safe(<4 x half> %a, <4 x half> %
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1541,22 +1551,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nnan_flag(<4 x half> %a, <4 x ha
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select nnan <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1616,22 +1627,23 @@ define <4 x half> @v_test_fmax_legacy_uge_v4f16_nsz_flag(<4 x half> %a, <4 x hal
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v5, v4
-; GFX12-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v7, v6
-; GFX12-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v2
-; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX12-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v3
-; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX12-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s0, v0, v2
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s1, v5, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cmp_nlt_f16_e64 s2, v7, v6
+; GFX12-NEXT:    v_cndmask_b16 v1, v3, v1, vcc_lo
+; GFX12-NEXT:    v_cndmask_b16 v0, v2, v0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b16 v2, v4, v5, s1
+; GFX12-NEXT:    v_cndmask_b16 v3, v6, v7, s2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX12-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp uge <4 x half> %a, %b
   %val = select nsz <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 572026da79646c..28b0d2cfba7317 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -101,12 +101,12 @@ define amdgpu_kernel void @select_f16(
 ; GFX11-NEXT:    s_mov_b32 s17, s11
 ; GFX11-NEXT:    s_mov_b32 s20, s12
 ; GFX11-NEXT:    s_mov_b32 s21, s13
-; GFX11-NEXT:    s_mov_b32 s24, s14
-; GFX11-NEXT:    s_mov_b32 s25, s15
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s24, s14
+; GFX11-NEXT:    s_mov_b32 s25, s15
 ; GFX11-NEXT:    buffer_load_u16 v2, off, s[24:27], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_u16 v3, off, s[0:3], 0 glc dlc
@@ -114,7 +114,7 @@ define amdgpu_kernel void @select_f16(
 ; GFX11-NEXT:    s_mov_b32 s4, s8
 ; GFX11-NEXT:    s_mov_b32 s5, s9
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v3, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
 
@@ -210,25 +210,25 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; GFX11-NEXT:    s_mov_b32 s15, s11
 ; GFX11-NEXT:    s_mov_b32 s18, s10
 ; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s12, s2
 ; GFX11-NEXT:    s_mov_b32 s13, s3
 ; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s17, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s10
+; GFX11-NEXT:    s_mov_b32 s7, s11
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT:    buffer_load_u16 v2, off, s[4:7], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s8, s0
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -321,25 +321,25 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; GFX11-NEXT:    s_mov_b32 s15, s11
 ; GFX11-NEXT:    s_mov_b32 s18, s10
 ; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s12, s2
 ; GFX11-NEXT:    s_mov_b32 s13, s3
 ; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s17, s5
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s10
+; GFX11-NEXT:    s_mov_b32 s7, s11
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT:    buffer_load_u16 v2, off, s[4:7], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s8, s0
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v1, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -433,25 +433,25 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; GFX11-NEXT:    s_mov_b32 s15, s11
 ; GFX11-NEXT:    s_mov_b32 s18, s10
 ; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s12, s2
 ; GFX11-NEXT:    s_mov_b32 s13, s3
 ; GFX11-NEXT:    s_mov_b32 s16, s4
 ; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s10
+; GFX11-NEXT:    s_mov_b32 s7, s11
 ; GFX11-NEXT:    s_mov_b32 s8, s0
+; GFX11-NEXT:    buffer_load_u16 v2, off, s[4:7], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3800, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -545,25 +545,25 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; GFX11-NEXT:    s_mov_b32 s15, s11
 ; GFX11-NEXT:    s_mov_b32 s18, s10
 ; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s12, s2
 ; GFX11-NEXT:    s_mov_b32 s13, s3
 ; GFX11-NEXT:    s_mov_b32 s16, s4
 ; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
 ; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s4, s6
+; GFX11-NEXT:    s_mov_b32 s5, s7
+; GFX11-NEXT:    s_mov_b32 s6, s10
+; GFX11-NEXT:    s_mov_b32 s7, s11
 ; GFX11-NEXT:    s_mov_b32 s8, s0
+; GFX11-NEXT:    buffer_load_u16 v2, off, s[4:7], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3800, v2, vcc_lo
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -686,12 +686,12 @@ define amdgpu_kernel void @select_v2f16(
 ; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x44
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, s2
-; GFX11-NEXT:    s_mov_b32 s7, s3
 ; GFX11-NEXT:    s_mov_b32 s22, s2
 ; GFX11-NEXT:    s_mov_b32 s23, s3
 ; GFX11-NEXT:    s_mov_b32 s18, s2
 ; GFX11-NEXT:    s_mov_b32 s19, s3
+; GFX11-NEXT:    s_mov_b32 s6, s2
+; GFX11-NEXT:    s_mov_b32 s7, s3
 ; GFX11-NEXT:    s_mov_b32 s26, s2
 ; GFX11-NEXT:    s_mov_b32 s27, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
@@ -699,28 +699,30 @@ define amdgpu_kernel void @select_v2f16(
 ; GFX11-NEXT:    s_mov_b32 s21, s13
 ; GFX11-NEXT:    s_mov_b32 s16, s10
 ; GFX11-NEXT:    s_mov_b32 s17, s11
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[20:23], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
 ; GFX11-NEXT:    s_mov_b32 s24, s14
 ; GFX11-NEXT:    s_mov_b32 s25, s15
-; GFX11-NEXT:    buffer_load_b32 v0, off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b32 v1, off, s[20:23], 0
-; GFX11-NEXT:    buffer_load_b32 v2, off, s[16:19], 0
+; GFX11-NEXT:    buffer_load_b32 v2, off, s[4:7], 0
 ; GFX11-NEXT:    buffer_load_b32 v3, off, s[24:27], 0
-; GFX11-NEXT:    s_mov_b32 s0, s8
 ; GFX11-NEXT:    s_mov_b32 s1, s9
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v2, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v6, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-NEXT:    v_cmp_lt_f16_e64 s0, v5, v4
+; GFX11-NEXT:    v_cndmask_b16 v2, v2, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX11-NEXT:    s_mov_b32 s0, s8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -827,42 +829,42 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ;
 ; GFX11-LABEL: select_v2f16_imm_a:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
+; GFX11-NEXT:    s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s14, s2
+; GFX11-NEXT:    s_mov_b32 s15, s3
+; GFX11-NEXT:    s_mov_b32 s18, s2
+; GFX11-NEXT:    s_mov_b32 s19, s3
+; GFX11-NEXT:    s_mov_b32 s22, s2
+; GFX11-NEXT:    s_mov_b32 s23, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
+; GFX11-NEXT:    s_mov_b32 s16, s8
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-NEXT:    s_mov_b32 s17, s9
+; GFX11-NEXT:    s_mov_b32 s20, s10
+; GFX11-NEXT:    s_mov_b32 s21, s11
 ; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
 ; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_cmp_lt_f16_e64 s0, 0x3900, v3
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v4, v0, s0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b,
@@ -966,42 +968,42 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ;
 ; GFX11-LABEL: select_v2f16_imm_b:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
+; GFX11-NEXT:    s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s14, s2
+; GFX11-NEXT:    s_mov_b32 s15, s3
+; GFX11-NEXT:    s_mov_b32 s18, s2
+; GFX11-NEXT:    s_mov_b32 s19, s3
+; GFX11-NEXT:    s_mov_b32 s22, s2
+; GFX11-NEXT:    s_mov_b32 s23, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
+; GFX11-NEXT:    s_mov_b32 s16, s8
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-NEXT:    s_mov_b32 s17, s9
+; GFX11-NEXT:    s_mov_b32 s20, s10
+; GFX11-NEXT:    s_mov_b32 s21, s11
 ; GFX11-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
 ; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_cmp_gt_f16_e64 s0, 0x3900, v3
+; GFX11-NEXT:    v_cndmask_b16 v1, v2, v1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v0, v4, v0, s0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_mov_b32 s0, s4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1107,42 +1109,42 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ;
 ; GFX11-LABEL: select_v2f16_imm_c:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
+; GFX11-NEXT:    s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s18, s2
+; GFX11-NEXT:    s_mov_b32 s19, s3
+; GFX11-NEXT:    s_mov_b32 s14, s2
+; GFX11-NEXT:    s_mov_b32 s15, s3
+; GFX11-NEXT:    s_mov_b32 s22, s2
+; GFX11-NEXT:    s_mov_b32 s23, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
+; GFX11-NEXT:    s_mov_b32 s16, s8
+; GFX11-NEXT:    s_mov_b32 s17, s9
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[16:19], 0
 ; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT:    s_mov_b32 s20, s10
+; GFX11-NEXT:    s_mov_b32 s21, s11
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v1, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_nlt_f16_e64 s0, v4, v3
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3800, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1248,42 +1250,42 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ;
 ; GFX11-LABEL: select_v2f16_imm_d:
 ; GFX11:       ; %bb.0: ; %entry
-; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s10, -1
-; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s18, s10
-; GFX11-NEXT:    s_mov_b32 s19, s11
-; GFX11-NEXT:    s_mov_b32 s14, s10
-; GFX11-NEXT:    s_mov_b32 s15, s11
-; GFX11-NEXT:    s_mov_b32 s22, s10
-; GFX11-NEXT:    s_mov_b32 s23, s11
+; GFX11-NEXT:    s_load_b256 s[4:11], s[4:5], 0x24
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s18, s2
+; GFX11-NEXT:    s_mov_b32 s19, s3
+; GFX11-NEXT:    s_mov_b32 s14, s2
+; GFX11-NEXT:    s_mov_b32 s15, s3
+; GFX11-NEXT:    s_mov_b32 s22, s2
+; GFX11-NEXT:    s_mov_b32 s23, s3
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s16, s4
-; GFX11-NEXT:    s_mov_b32 s17, s5
-; GFX11-NEXT:    s_mov_b32 s12, s2
-; GFX11-NEXT:    s_mov_b32 s13, s3
-; GFX11-NEXT:    s_mov_b32 s20, s6
-; GFX11-NEXT:    s_mov_b32 s21, s7
+; GFX11-NEXT:    s_mov_b32 s16, s8
+; GFX11-NEXT:    s_mov_b32 s17, s9
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
 ; GFX11-NEXT:    buffer_load_b32 v0, off, s[16:19], 0
 ; GFX11-NEXT:    buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT:    s_mov_b32 s20, s10
+; GFX11-NEXT:    s_mov_b32 s21, s11
+; GFX11-NEXT:    s_mov_b32 s1, s5
 ; GFX11-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT:    s_mov_b32 s8, s0
-; GFX11-NEXT:    s_mov_b32 s9, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v1, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cmp_lt_f16_e64 s0, v4, v3
+; GFX11-NEXT:    v_cndmask_b16 v1, 0x3800, v2, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, 0x3900, v0, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT:    s_mov_b32 s0, s4
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
@@ -1403,22 +1405,23 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond
 ; GFX11-LABEL: v_vselect_v4f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
+; GFX11-NEXT:    v_cndmask_b16 v4, v7, v5, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b16 v5, v9, v8, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v0, v2, v0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b16 v1, v3, v1, s2
 ; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <4 x i32> %cond, zeroinitializer
   %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
@@ -1596,37 +1599,36 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond
 ; GFX11-LABEL: v_vselect_v8f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v7
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v17, v16, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v17, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v19, v18, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v21, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v2, v13, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v10
-; GFX11-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v3, v15, v3, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v10
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v12
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v9
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 0, v11
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v13
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 0, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v14
+; GFX11-NEXT:    v_cndmask_b16 v8, v9, v8, s5
+; GFX11-NEXT:    v_cndmask_b16 v9, v11, v10, s4
+; GFX11-NEXT:    v_cndmask_b16 v10, v13, v12, s3
+; GFX11-NEXT:    v_cndmask_b16 v11, v16, v15, s2
+; GFX11-NEXT:    v_cndmask_b16 v0, v4, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v5, v1, s0
+; GFX11-NEXT:    v_cndmask_b16 v2, v6, v2, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, v7, v3, s6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v0, v11, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v10, v1, 0x5040100
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_perm_b32 v2, v9, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v8, v3, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <8 x i32> %cond, zeroinitializer
   %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
@@ -1990,67 +1992,64 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32>
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v4
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v26
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v12
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v24
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v22
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v20
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v18
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v29
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v35, v34, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v37, v36, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v25
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v6, v8, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v9, v5, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v39, v38, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v53, v52, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v17
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v4, v10, v4, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v55, v54, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v51, v50, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v49, v48, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v2, v13, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v14, v3, 0x5040100
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v18
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v20
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v22
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 0, v24
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v26
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 0, v28
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v30
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s7, 0, v17
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s8, 0, v19
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s9, 0, v21
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s10, 0, v23
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s11, 0, v25
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s12, 0, v27
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s13, 0, v29
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v15
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 16, v6
+; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GFX11-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 16, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v12
+; GFX11-NEXT:    v_lshrrev_b32_e32 v24, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 16, v11
+; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 16, v10
+; GFX11-NEXT:    v_lshrrev_b32_e32 v28, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 16, v9
+; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v8
+; GFX11-NEXT:    v_cndmask_b16 v0, v8, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v18, v19, v18, s13
+; GFX11-NEXT:    v_cndmask_b16 v19, v21, v20, s12
+; GFX11-NEXT:    v_cndmask_b16 v20, v23, v22, s11
+; GFX11-NEXT:    v_cndmask_b16 v21, v25, v24, s10
+; GFX11-NEXT:    v_cndmask_b16 v22, v27, v26, s9
+; GFX11-NEXT:    v_cndmask_b16 v23, v29, v28, s8
+; GFX11-NEXT:    v_cndmask_b16 v24, v32, v30, s7
+; GFX11-NEXT:    v_cndmask_b16 v7, v15, v7, s6
+; GFX11-NEXT:    v_cndmask_b16 v6, v14, v6, s5
+; GFX11-NEXT:    v_cndmask_b16 v5, v13, v5, s4
+; GFX11-NEXT:    v_cndmask_b16 v4, v12, v4, s3
+; GFX11-NEXT:    v_cndmask_b16 v1, v9, v1, s0
+; GFX11-NEXT:    v_cndmask_b16 v2, v10, v2, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, v11, v3, s2
+; GFX11-NEXT:    v_perm_b32 v0, v24, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v20, v4, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v23, v1, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v22, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v21, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v19, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v18, v6, 0x5040100
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v33, v32, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v12, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v7, v11, v7, 0x5040100
+; GFX11-NEXT:    v_cndmask_b16 v8, v17, v16, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <16 x i32> %cond, zeroinitializer
   %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b
@@ -2922,39 +2921,40 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32
+; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:96
+; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v15
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v97, 16, v14
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 16, v30
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v99, 16, v13
@@ -2982,131 +2982,123 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32>
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v145, 16, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v146, 16, v18
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v147, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v96, 16, v15
 ; GFX11-NEXT:    s_waitcnt vmcnt(32)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v31
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 16, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v97, v98, v97, vcc_lo
 ; GFX11-NEXT:    s_waitcnt vmcnt(31)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v98, 16, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v99, v100, v99, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v32
+; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(30)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s1, 0, v33
+; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
 ; GFX11-NEXT:    s_waitcnt vmcnt(29)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v34
-; GFX11-NEXT:    v_lshrrev_b32_e32 v100, 16, v33
-; GFX11-NEXT:    v_cndmask_b32_e32 v34, v102, v101, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s2, 0, v34
 ; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v35
-; GFX11-NEXT:    v_cndmask_b32_e32 v35, v112, v103, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s3, 0, v35
 ; GFX11-NEXT:    s_waitcnt vmcnt(27)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v36
-; GFX11-NEXT:    v_cndmask_b32_e32 v36, v114, v113, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s4, 0, v36
 ; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v37
-; GFX11-NEXT:    v_cndmask_b32_e32 v37, v116, v115, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s5, 0, v37
 ; GFX11-NEXT:    s_waitcnt vmcnt(25)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v38
-; GFX11-NEXT:    v_cndmask_b32_e32 v38, v118, v117, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s6, 0, v38
 ; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v39
-; GFX11-NEXT:    v_cndmask_b32_e32 v39, v128, v119, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s7, 0, v39
 ; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v48
-; GFX11-NEXT:    v_cndmask_b32_e32 v48, v130, v129, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s8, 0, v48
 ; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v49
-; GFX11-NEXT:    v_cndmask_b32_e32 v49, v132, v131, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s9, 0, v49
 ; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v50
-; GFX11-NEXT:    v_cndmask_b32_e32 v50, v134, v133, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s10, 0, v50
 ; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v51
-; GFX11-NEXT:    v_cndmask_b32_e32 v51, v144, v135, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s11, 0, v51
 ; GFX11-NEXT:    s_waitcnt vmcnt(19)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v52
-; GFX11-NEXT:    v_cndmask_b32_e32 v52, v146, v145, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s12, 0, v52
 ; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v53
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v31, v147, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s13, 0, v53
 ; GFX11-NEXT:    s_waitcnt vmcnt(17)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v54
-; GFX11-NEXT:    v_cndmask_b32_e32 v32, v32, v98, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s14, 0, v54
 ; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v55
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v33, v15, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s15, 0, v55
 ; GFX11-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v64
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v30, v14, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s16, 0, v64
 ; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v65
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v14, v97, v14, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v29, v13, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s17, 0, v65
 ; GFX11-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v66
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v28, v12, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s18, 0, v66
 ; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v67
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v12, v34, v12, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v27, v11, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s19, 0, v67
 ; GFX11-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v68
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v26, v10, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s20, 0, v68
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v69
-; GFX11-NEXT:    v_perm_b32 v13, v99, v13, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v10, v36, v10, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v25, v9, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s21, 0, v69
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v70
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v24, v8, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s22, 0, v70
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v71
-; GFX11-NEXT:    v_perm_b32 v11, v35, v11, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v8, v38, v8, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v23, v7, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s23, 0, v71
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v80
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v22, v6, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s24, 0, v80
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v81
-; GFX11-NEXT:    v_perm_b32 v9, v37, v9, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v6, v48, v6, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v21, v5, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s25, 0, v81
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v82
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v20, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s26, 0, v82
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v83
-; GFX11-NEXT:    v_perm_b32 v7, v39, v7, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v4, v50, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v19, v3, vcc_lo
+; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v83
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v84
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v17, v1, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s27, 0, v84
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v85
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s28, 0, v85
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v86
-; GFX11-NEXT:    v_perm_b32 v5, v49, v5, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v32, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s29, 0, v86
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v87
-; GFX11-NEXT:    v_perm_b32 v3, v51, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v52, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v100, v96, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_hi, 0, v87
+; GFX11-NEXT:    v_cndmask_b16 v34, v34, v96, s26
+; GFX11-NEXT:    v_cndmask_b16 v35, v98, v97, s27
+; GFX11-NEXT:    v_cndmask_b16 v36, v100, v99, s28
+; GFX11-NEXT:    v_cndmask_b16 v37, v102, v101, s29
+; GFX11-NEXT:    v_cndmask_b16 v38, v112, v103, vcc_hi
+; GFX11-NEXT:    v_cndmask_b16 v39, v114, v113, s25
+; GFX11-NEXT:    v_cndmask_b16 v48, v116, v115, s24
+; GFX11-NEXT:    v_cndmask_b16 v49, v118, v117, s23
+; GFX11-NEXT:    v_cndmask_b16 v50, v128, v119, s22
+; GFX11-NEXT:    v_cndmask_b16 v51, v130, v129, s21
+; GFX11-NEXT:    v_cndmask_b16 v52, v132, v131, s20
+; GFX11-NEXT:    v_cndmask_b16 v53, v134, v133, s19
+; GFX11-NEXT:    v_cndmask_b16 v54, v144, v135, s18
+; GFX11-NEXT:    v_cndmask_b16 v55, v146, v145, s17
+; GFX11-NEXT:    v_cndmask_b16 v31, v31, v147, s16
+; GFX11-NEXT:    v_cndmask_b16 v32, v33, v32, s15
+; GFX11-NEXT:    v_cndmask_b16 v15, v83, v15, s14
+; GFX11-NEXT:    v_cndmask_b16 v14, v30, v14, s13
+; GFX11-NEXT:    v_cndmask_b16 v13, v29, v13, s12
+; GFX11-NEXT:    v_cndmask_b16 v12, v28, v12, s11
+; GFX11-NEXT:    v_cndmask_b16 v11, v27, v11, s10
+; GFX11-NEXT:    v_cndmask_b16 v10, v26, v10, s9
+; GFX11-NEXT:    v_cndmask_b16 v9, v25, v9, s8
+; GFX11-NEXT:    v_cndmask_b16 v8, v24, v8, s7
+; GFX11-NEXT:    v_cndmask_b16 v7, v23, v7, s6
+; GFX11-NEXT:    v_cndmask_b16 v6, v22, v6, s5
+; GFX11-NEXT:    v_cndmask_b16 v5, v21, v5, s4
+; GFX11-NEXT:    v_cndmask_b16 v0, v16, v0, vcc_lo
+; GFX11-NEXT:    v_cndmask_b16 v1, v17, v1, s0
+; GFX11-NEXT:    v_cndmask_b16 v2, v18, v2, s1
+; GFX11-NEXT:    v_cndmask_b16 v3, v19, v3, s2
+; GFX11-NEXT:    v_cndmask_b16 v4, v20, v4, s3
+; GFX11-NEXT:    v_perm_b32 v0, v32, v0, 0x5040100
 ; GFX11-NEXT:    v_perm_b32 v1, v31, v1, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v15, v16, v15, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v2, v55, v2, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v3, v54, v3, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v4, v53, v4, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v5, v52, v5, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v6, v51, v6, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v7, v50, v7, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v8, v49, v8, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v9, v48, v9, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v10, v39, v10, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v11, v38, v11, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v12, v37, v12, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v13, v36, v13, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v14, v35, v14, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v15, v34, v15, 0x5040100
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq <32 x i32> %cond, zeroinitializer
   %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index f20c1ccb2d63eb..577d938771f718 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1882,12 +1882,12 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX11-NEXT:    v_cndmask_b16 v0, v0, v1, s[2:3]
 ; GFX11-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
index c936c13ac6c66f..d91ee542159245 100644
--- a/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir
@@ -159,16 +159,16 @@ name:            mask_hazard_cndmask_dpp3
 body:            |
   bb.0:
     ; GFX11-LABEL: name: mask_hazard_cndmask_dpp3
-    ; GFX11: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    ; GFX11: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
     ; GFX11-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
     ; GFX11-NEXT: S_WAITCNT_DEPCTR 65534
     ; GFX11-NEXT: S_ENDPGM 0
     ;
     ; GFX12-LABEL: name: mask_hazard_cndmask_dpp3
-    ; GFX12: $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    ; GFX12: $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
     ; GFX12-NEXT: $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
     ; GFX12-NEXT: S_ENDPGM 0
-    $vgpr0 = V_CNDMASK_B16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
+    $vgpr0 = V_CNDMASK_B16_fake16_e64_dpp $vgpr0, 0, $vgpr1, 0, $vgpr2, $sgpr2_sgpr3, 1, 15, 15, 1, implicit $exec
     $sgpr2_sgpr3 = S_CSELECT_B64 -1, 0, implicit $scc
     S_ENDPGM 0
 ...



More information about the llvm-commits mailing list