[llvm] 721cba4 - [AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 (#137930)

via llvm-commits llvm-commits at lists.llvm.org
Mon May 5 09:14:17 PDT 2025


Author: Frederik Harwath
Date: 2025-05-05T18:14:14+02:00
New Revision: 721cba476d68bae13e9c0a0060fd37c386050e74

URL: https://github.com/llvm/llvm-project/commit/721cba476d68bae13e9c0a0060fd37c386050e74
DIFF: https://github.com/llvm/llvm-project/commit/721cba476d68bae13e9c0a0060fd37c386050e74.diff

LOG: [AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 (#137930)

The VOP3 form of the V_CNDMASK_B32 instruction takes a carry-in
operand. The conversion to SDWA implies a conversion to VOP2 form
which reads from VCC instead.

Convert V_CNDMASK_B32_e64 instructions that might be converted to SDWA
to V_CNDMASK_B32_e32 first and introduce a copy of the carry-in operand
to VCC.

Closes #133431.

---------

Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>

Added: 
    llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
    llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
    llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
    llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fmaximum3.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fminimum3.ll
    llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
    llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
    llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 22f23e4c94e2d..a320e6769a115 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -62,6 +62,7 @@ class SIPeepholeSDWA {
   std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
   void pseudoOpConvertToVOP2(MachineInstr &MI,
                              const GCNSubtarget &ST) const;
+  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
   MachineInstr *createSDWAVersion(MachineInstr &MI);
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
   void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
@@ -1037,7 +1038,8 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
     return;
   // Make sure VCC or its subregs are dead before MI.
   MachineBasicBlock &MBB = *MI.getParent();
-  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
+  MachineBasicBlock::LivenessQueryResult Liveness =
+      MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
   if (Liveness != MachineBasicBlock::LQR_Dead)
     return;
   // Check if VCC is referenced in range of (MI,MISucc].
@@ -1061,6 +1063,52 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
   MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
 }
 
+/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
+/// operand into the corresponding VOP2 form which expects the
+/// argument in VCC. To this end, add an copy from the carry-in to
+/// VCC.  The conversion will only be applied if \p MI can be shrunk
+/// to VOP2 and if VCC can be proven to be dead before \p MI.
+void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
+                                           const GCNSubtarget &ST) const {
+  assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
+
+  LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
+  if (!TII->canShrink(MI, *MRI)) {
+    LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
+    return;
+  }
+
+  const MachineOperand &CarryIn =
+      *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+  Register CarryReg = CarryIn.getReg();
+  MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
+  if (!CarryDef) {
+    LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
+    return;
+  }
+
+  // Make sure VCC or its subregs are dead before MI.
+  MCRegister Vcc = TRI->getVCC();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineBasicBlock::LivenessQueryResult Liveness =
+      MBB.computeRegisterLiveness(TRI, Vcc, MI);
+  if (Liveness != MachineBasicBlock::LQR_Dead) {
+    LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
+    return;
+  }
+
+  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
+
+  auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
+                           TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+                       .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+                       .setMIFlags(MI.getFlags());
+  LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
+  MI.eraseFromParent();
+}
+
 namespace {
 bool isConvertibleToSDWA(MachineInstr &MI,
                          const GCNSubtarget &ST,
@@ -1070,6 +1118,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
   if (TII->isSDWA(Opc))
     return true;
 
+  // Can only be handled after ealier conversion to
+  // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
+  if (Opc == AMDGPU::V_CNDMASK_B32_e64)
+    return false;
+
   // Check if this instruction has opcode that supports SDWA
   if (AMDGPU::getSDWAOp(Opc) == -1)
     Opc = AMDGPU::getVOPe32(Opc);
@@ -1108,10 +1161,6 @@ bool isConvertibleToSDWA(MachineInstr &MI,
   if (TII->pseudoToMCOpcode(Opc) == -1)
     return false;
 
-  // FIXME: has SDWA but require handling of implicit VCC use
-  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
-    return false;
-
   if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
     if (!Src0->isReg() && !Src0->isImm())
       return false;
@@ -1266,7 +1315,9 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
     SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
   }
 
-  return SDWAInst.getInstr();
+  MachineInstr *Ret = SDWAInst.getInstr();
+  TII->fixImplicitOperands(*Ret);
+  return Ret;
 }
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -1384,10 +1435,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
       for (const auto &OperandPair : SDWAOperands) {
         const auto &Operand = OperandPair.second;
         MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
-        if (PotentialMI &&
-           (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
-            PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
+        if (!PotentialMI)
+          continue;
+
+        switch (PotentialMI->getOpcode()) {
+        case AMDGPU::V_ADD_CO_U32_e64:
+        case AMDGPU::V_SUB_CO_U32_e64:
           pseudoOpConvertToVOP2(*PotentialMI, ST);
+          break;
+        case AMDGPU::V_CNDMASK_B32_e64:
+          convertVcndmaskToVOP2(*PotentialMI, ST);
+          break;
+        };
       }
       SDWAOperands.clear();
 

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 19b6ff68b9869..c4957fd44e2be 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -38481,10 +38481,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -38494,9 +38491,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -38505,11 +38500,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -38577,29 +38570,24 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX8-LABEL: v_vselect_v2bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_vselect_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -38607,14 +38595,12 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX10-LABEL: v_vselect_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s4
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -38771,13 +38757,12 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -38882,14 +38867,13 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
@@ -40792,48 +40776,42 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX9-LABEL: v_vselect_v4bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v7, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    v_perm_b32 v1, v3, v2, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
+; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_vselect_v4bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v0
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v6, v4, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v3, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
@@ -41081,42 +41059,37 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX10-LABEL: v_vselect_v8bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 1, v1
+; GFX10-NEXT:    v_and_b32_e32 v1, 1, v4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v14, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s6
+; GFX10-NEXT:    v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    v_perm_b32 v0, v7, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v13, v9, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v15, v11, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v2, v6, v5, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v4, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_vselect_v8bf16:

diff  --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 4fe11760e71fd..92ece0d007fe2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -652,14 +652,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v1
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -760,16 +760,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 32
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 9fcfbba6fb235..3c45596fba14b 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -629,9 +629,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 32
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -731,9 +731,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 32
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1
@@ -1508,10 +1508,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
-; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v3
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1612,10 +1611,9 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
-; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v2
+; GFX9-GISEL-NEXT:    v_cndmask_b32_sdwa v1, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
   %val = load i16, ptr addrspace(1) %arrayidx, align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index ce7281702c108..a511233af0703 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -2775,11 +2775,10 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -2794,11 +2793,11 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3f80
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -2910,11 +2909,10 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3f00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3f00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -2929,11 +2927,11 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3f00
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3f00
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3f80
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -3051,25 +3049,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX9-LABEL: fmul_select_v2bf16_test3:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
 ; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
@@ -3081,24 +3078,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX10-LABEL: fmul_select_v2bf16_test3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4000
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
@@ -3261,25 +3258,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX9-LABEL: fmul_select_v2bf16_test4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3f80
-; GFX9-NEXT:    v_mov_b32_e32 v6, 0x3f00
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3f00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s4
 ; GFX9-NEXT:    v_or_b32_e32 v4, 0x400000, v1
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s4
 ; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
 ; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
@@ -3291,24 +3287,24 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX10-LABEL: fmul_select_v2bf16_test4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v5, 0x3f00
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo
-; GFX10-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x3f00
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x3f80
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, v1, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX10-NEXT:    v_cndmask_b32_sdwa v3, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v2, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v5, vcc_lo
@@ -3466,11 +3462,10 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test5:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4100
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4100
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -3485,11 +3480,11 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4000
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4100
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -3603,11 +3598,10 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test6:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4040
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc100
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4040
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffc100
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -3622,11 +3616,11 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffc100
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffc100
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4040
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -3739,11 +3733,10 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test7:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc080
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4100
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffc080
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4100
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -3758,11 +3751,11 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x4100
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4100
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffffc080
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -3875,10 +3868,10 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff8000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -4004,11 +3997,10 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-LABEL: fmul_select_bf16_test9:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffc200
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffc180
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffc200
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffc180
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -4023,11 +4015,11 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX10-LABEL: fmul_select_bf16_test9:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffc180
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffc180
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffffc200
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -4141,11 +4133,10 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 ; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffffdb80
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffffe000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffdb80
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffe000
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -4160,11 +4151,11 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 ; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffffe000
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffffe000
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffffdb80
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0
@@ -4278,11 +4269,10 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
 ; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4c00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3480
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4c00
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3480
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
@@ -4297,11 +4287,11 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
 ; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0x3480
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3480
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0x4c00
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_bfe_u32 v1, v0, 16, 1
 ; GFX10-NEXT:    v_or_b32_e32 v2, 0x400000, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 8aab9ec885f3c..35fe6ebaf1b12 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -611,28 +611,24 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v1, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 3
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 4
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 5
-; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 6
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 7
-; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    flat_store_short v[5:6], v0
 ; VI-NEXT:    s_endpgm
 ;
@@ -864,31 +860,27 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 9
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 10
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 11
-; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 12
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 13
-; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 14
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v7, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s4, 15
-; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v8, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    flat_store_short v[9:10], v0
 ; VI-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index db6ec2b32ad63..12b26cb6d8c19 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,18 +1,24 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
+; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89 %s
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
-; GCN: s_load_dword [[VEC:s[0-9]+]]
-; SIVI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
-; SIVI-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
-; SIVI-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
-; SIVI-DAG: buffer_store_short [[VELT0]]
-; SIVI-DAG: buffer_store_short [[VELT1]]
-; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
-; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]],
-; GFX9: buffer_store_short [[VVEC]],
 define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
+;
+; GCN-LABEL: extract_vector_elt_v2i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s4, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s5, s4, 16
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:20
+; GCN-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
   %p1 = extractelement <2 x i16> %vec, i32 1
@@ -22,33 +28,65 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(ptr addrspace(1) %out, ptr a
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_sgpr:
-; GCN: s_load_dword [[IDX:s[0-9]+]]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4
-; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
-; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
-; GCN: buffer_store_short [[VELT1]]
-; GCN: ScratchSize: 0
 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
+; GCN-LABEL: extract_vector_elt_v2i16_dynamic_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x15
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_lshl_b32 s4, s4, 4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s4, s2, s4
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: extract_vector_elt_v2i16_dynamic_sgpr:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX89-NEXT:    s_load_dword s8, s[4:5], 0x54
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_lshl_b32 s0, s8, 4
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_lshr_b32 s0, s2, s0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s0
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_endpgm
   %vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
   store i16 %elt, ptr addrspace(1) %out, align 2
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2i16_dynamic_vgpr:
-; GCN-DAG: {{flat|buffer|global}}_load_dword [[IDX:v[0-9]+]]
-; GCN-DAG: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]]
-; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]
-
-; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
-; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]
-
-; SI: buffer_store_short [[ELT]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
-; GCN: ScratchSize: 0{{$}}
 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 {
+;
+; GCN-LABEL: extract_vector_elt_v2i16_dynamic_vgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v3
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_lshr_b32_e32 v0, s2, v0
+; GCN-NEXT:    buffer_store_short v0, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
@@ -60,14 +98,39 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(ptr addrspace(1
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
-; GCN: s_load_dwordx4
-
-; GCN-NOT: {{buffer|flat|global}}_load
-
-; GCN: buffer_store_short
-; GCN: buffer_store_short
 define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x i16> %foo) #0 {
+;
+;
+;
+;
+; GCN-LABEL: extract_vector_elt_v3i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: extract_vector_elt_v3i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s3
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    v_mov_b32_e32 v0, s2
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
+; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <3 x i16> %foo, i32 0
   %p1 = extractelement <3 x i16> %foo, i32 2
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
@@ -76,17 +139,40 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(ptr addrspace(1) %out, <3 x
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; SI: s_load_dwordx4
-; SI: buffer_store_short
-; SI: buffer_store_short
-
-; GFX89-DAG: s_load_dwordx4 s[[[#LOAD:]]:[[#END:]]], s[4:5], 0x24
-; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[#LOAD + 2]]
-; GFX89-DAG: buffer_store_short [[VLOAD0]], off
-; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[#LOAD + 3]]
-; GFX89-DAG: buffer_store_short [[VLOAD1]], off
 define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x i16> %foo) #0 {
+;
+;
+; GCN-LABEL: extract_vector_elt_v4i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_mov_b32_e32 v0, s3
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:20
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: extract_vector_elt_v4i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s4, s0
+; GFX89-NEXT:    s_mov_b32 s5, s1
+; GFX89-NEXT:    v_mov_b32_e32 v0, s3
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v0, s2
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:20
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 10
@@ -95,36 +181,64 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(ptr addrspace(1) %out, <4 x
   ret void
 }
 
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
-; SI: s_load_dword s
-; SI: s_load_dwordx2 s
-; SI: s_load_dwordx2 s
-
-; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x24
-; GFX89-DAG: s_load_dwordx2 s[[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]], s[4:5], 0x4c
-; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[4:5], 0x54
-
-; GCN-NOT: {{buffer|flat|global}}
-
-; SICI: buffer_store_short
-; SICI: buffer_store_short
-; SICI: buffer_store_short
-
-; GFX9-NOT: s_pack_ll_b32_b16
-; GFX9-NOT: s_pack_lh_b32_b16
-
-; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
-; GCN: {{buffer|global}}_store_short
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(ptr addrspace(1) %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
+;
+;
+; GCN-LABEL: dynamic_extract_vector_elt_v3i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s2, s[4:5], 0x15
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshl_b32 s2, s2, 4
+; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s2
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: dynamic_extract_vector_elt_v3i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dword s8, s[4:5], 0x54
+; GFX89-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_lshl_b32 s4, s8, 4
+; GFX89-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
+; GFX89-NEXT:    v_mov_b32_e32 v0, s4
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_endpgm
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, ptr addrspace(1) %out, i32 1
   store i16 %p0, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr:
 define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %idx) #0 {
+;
+;
+;
+; GCN-LABEL: v_insertelement_v4i16_dynamic_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    s_lshl_b32 s4, s8, 4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshr_b64 v[3:4], v[3:4], s4
+; GCN-NEXT:    buffer_store_short v3, v[1:2], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -135,13 +249,44 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_01:
-; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4) %ptr) #0 {
+;
+;
+;
+;
+; GCN-LABEL: reduce_load_vector_v8i16_extract_01:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 16
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: reduce_load_vector_v8i16_extract_01:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX89-NEXT:    v_mov_b32_e32 v0, s0
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v0, s1
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %ptr
   %elt0 = extractelement <16 x i16> %load, i32 0
   %elt1 = extractelement <16 x i16> %load, i32 1
@@ -150,13 +295,44 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_01(ptr addrspace(4)
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8i16_extract_23:
-; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4) %ptr) #0 {
+;
+;
+;
+;
+; GCN-LABEL: reduce_load_vector_v8i16_extract_23:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0x1
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_lshr_b32 s1, s0, 16
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v0, s1
+; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    s_endpgm
+;
+; GFX89-LABEL: reduce_load_vector_v8i16_extract_23:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX89-NEXT:    s_mov_b32 s3, 0xf000
+; GFX89-NEXT:    s_mov_b32 s2, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_load_dword s0, s[0:1], 0x4
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX89-NEXT:    v_mov_b32_e32 v0, s0
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    v_mov_b32_e32 v0, s1
+; GFX89-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_endpgm
   %load = load <16 x i16>, ptr addrspace(4) %ptr
   %elt2 = extractelement <16 x i16> %load, i32 2
   %elt3 = extractelement <16 x i16> %load, i32 3
@@ -165,14 +341,26 @@ define amdgpu_kernel void @reduce_load_vector_v8i16_extract_23(ptr addrspace(4)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v8i16_2:
-; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4
-; SI: buffer_store_short [[RES]]
-; VI: flat_load_dword [[RES:v[0-9]+]]
-; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
-; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
-; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
 define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v8i16_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -183,14 +371,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_2(ptr addrspace(1) %out, ptr a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v8i16_6:
-; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12
-; SI: buffer_store_short [[RES]]
-; VI: flat_load_dword [[RES:v[0-9]+]]
-; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
-; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
-; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
 define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v8i16_6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -201,9 +401,52 @@ define amdgpu_kernel void @v_extractelement_v8i16_6(ptr addrspace(1) %out, ptr a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v8i16_dynamic_sgpr:
-; GCN-COUNT-7: v_cndmask_b32_e32
 define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v8i16_dynamic_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[4:5], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v7, v5
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    s_cmp_eq_u32 s8, 1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 2
+; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GCN-NEXT:    buffer_store_short v0, v[6:7], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -214,14 +457,26 @@ define amdgpu_kernel void @v_extractelement_v8i16_dynamic_sgpr(ptr addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v16i16_2:
-; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:4
-; SI: buffer_store_short [[RES]]
-; VI: flat_load_dword [[RES:v[0-9]+]]
-; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
-; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:4
-; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
 define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v16i16_2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -232,14 +487,26 @@ define amdgpu_kernel void @v_extractelement_v16i16_2(ptr addrspace(1) %out, ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v16i16_6:
-; SI: buffer_load_dword [[RES:v[0-9]+]], v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 addr64 offset:12
-; SI: buffer_store_short [[RES]]
-; VI: flat_load_dword [[RES:v[0-9]+]]
-; VI: flat_store_short v[{{[0-9:]+}}], [[RES]]
-; GFX9: global_load_dword [[RES:v[0-9]+]], v{{[0-9]+}}, s[{{[0-9:]+}}] offset:12
-; GFX9: global_store_short v{{[0-9]+}}, [[RES]]
 define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v16i16_6:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:12
+; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -250,9 +517,82 @@ define amdgpu_kernel void @v_extractelement_v16i16_6(ptr addrspace(1) %out, ptr
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v16i16_dynamic_sgpr:
-; GCN-COUNT-15: v_cndmask_b32_e32
 define amdgpu_kernel void @v_extractelement_v16i16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
+;
+;
+;
+; GCN-LABEL: v_extractelement_v16i16_dynamic_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-NEXT:    s_load_dword s8, s[4:5], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_lshlrev_b32_e32 v5, 5, v0
+; GCN-NEXT:    v_mov_b32_e32 v6, 0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GCN-NEXT:    buffer_load_dwordx4 v[1:4], v[5:6], s[4:7], 0 addr64
+; GCN-NEXT:    v_lshlrev_b32_e32 v9, 1, v0
+; GCN-NEXT:    v_mov_b32_e32 v10, v6
+; GCN-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GCN-NEXT:    buffer_load_dwordx4 v[5:8], v[5:6], s[4:7], 0 addr64 offset:16
+; GCN-NEXT:    s_cmp_eq_u32 s8, 1
+; GCN-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 2
+; GCN-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GCN-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; GCN-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GCN-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
+; GCN-NEXT:    v_lshrrev_b32_e32 v16, 16, v7
+; GCN-NEXT:    v_lshrrev_b32_e32 v17, 16, v8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 3
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 4
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 5
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 6
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 7
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 8
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 9
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 10
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 11
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 12
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 13
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 14
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_cmp_eq_u32 s8, 15
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v17, vcc
+; GCN-NEXT:    buffer_store_short v0, v[9:10], s[0:3], 0 addr64
+; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index 09279f6f0768c..67a9c12dca94a 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -116,9 +116,8 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v3, v2
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v1
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -223,13 +222,12 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v5, v4
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v1, v3
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-SAFE-NEXT:    v_cmp_nle_f16_e32 vcc, v0, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:

diff  --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 1b8a79ee982d1..567202be69fa6 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -2083,19 +2083,17 @@ define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_max_f16 v1, v2, v1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2131,19 +2129,17 @@ define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_max_f16 v1, v1, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2178,25 +2174,22 @@ define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
 ; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
-; GFX942-NEXT:    v_pk_max_f16 v3, v3, v4
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_max_f16 v3, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX942-NEXT:    v_perm_b32 v1, v4, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_perm_b32 v1, v6, v0, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v1, v1, v5
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2238,19 +2231,17 @@ define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_max_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2283,26 +2274,23 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX942-LABEL: v_fmaximum3_v2f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_max_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX942-NEXT:    v_perm_b32 v2, v3, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_perm_b32 v2, v4, v0, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v1
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fmaximum3_v2f16__inlineimm1:
@@ -2337,19 +2325,17 @@ define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v4, s0
 ; GFX942-NEXT:    v_pk_max_f16 v1, v1, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2408,10 +2394,9 @@ define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2471,10 +2456,9 @@ define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2617,10 +2601,9 @@ define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2656,35 +2639,33 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX942-LABEL: v_fmaximum3_v3f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v7, v1, 2.0
-; GFX942-NEXT:    s_mov_b32 s1, 0x5040100
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX942-NEXT:    s_mov_b32 s1, 0x5040100
 ; GFX942-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX942-NEXT:    v_perm_b32 v4, v5, v0, s1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_perm_b32 v4, v6, v0, s1
 ; GFX942-NEXT:    v_pk_max_f16 v4, v4, v2
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v5, v7, vcc
 ; GFX942-NEXT:    v_pack_b32_f16 v7, v1, s0
 ; GFX942-NEXT:    v_pk_max_f16 v7, v7, v3
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v6, v0, s1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fmaximum3_v3f16__inlineimm1:
@@ -2743,10 +2724,9 @@ define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2785,37 +2765,33 @@ define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v5, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v4, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2854,37 +2830,33 @@ define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v5
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v4
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2947,17 +2919,14 @@ define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
 ; GFX942-NEXT:    v_perm_b32 v2, v8, v1, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v11
 ; GFX942-NEXT:    v_perm_b32 v6, v9, v0, s0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v6, v6, v10
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v1, v12, v2, vcc
@@ -3012,37 +2981,33 @@ define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3078,42 +3043,40 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
 ; GFX942-LABEL: v_fmaximum3_v4f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_max_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX942-NEXT:    v_pk_max_f16 v7, v1, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v8, v5, v8, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_perm_b32 v4, v8, v1, s0
-; GFX942-NEXT:    v_pk_max_f16 v4, v4, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_perm_b32 v8, v5, v0, s0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX942-NEXT:    v_pk_max_f16 v8, v8, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v7, v6, v7, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX942-NEXT:    v_pk_max_f16 v4, v4, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_perm_b32 v6, v6, v0, s0
+; GFX942-NEXT:    v_pk_max_f16 v6, v6, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX942-NEXT:    v_perm_b32 v1, v7, v1, s0
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v8, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fmaximum3_v4f16__inlineimm1:
@@ -3151,37 +3114,33 @@ define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_max_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v4, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX942-NEXT:    v_pk_max_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v4, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4005,19 +3964,17 @@ define <4 x half> @v_no_fmaximum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v1, v5, s0
 ; GFX942-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 049c6799da079..fd809c6103d2c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -117,9 +117,8 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v3, v2
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v2, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v1
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -224,13 +223,12 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v5, v4
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v1, v3
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; VI-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc, v0, v2
 ; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-NNAN-LABEL: test_fmin_legacy_ule_v3f16:

diff  --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 96e9aa375f5ee..81b8e8ebd10e3 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -2083,19 +2083,17 @@ define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_min_f16 v1, v2, v1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v2, v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v2, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2131,19 +2129,17 @@ define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_min_f16 v1, v1, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2178,25 +2174,22 @@ define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
 ; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff7fff, v1
-; GFX942-NEXT:    v_pk_min_f16 v3, v3, v4
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, |v0|, |v1| src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_min_f16 v3, v3, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    v_and_b32_e32 v5, 0x7fff7fff, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
-; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v3, vcc
-; GFX942-NEXT:    v_perm_b32 v1, v4, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_perm_b32 v1, v6, v0, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, |v2| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v1, v1, v5
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, |v2| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v0, |v2|
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
 ; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2238,19 +2231,17 @@ define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v5, s0
 ; GFX942-NEXT:    v_pk_min_f16 v1, v1, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v5, -v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2283,26 +2274,23 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) {
 ; GFX942-LABEL: v_fminimum3_v2f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v4, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_min_f16 v2, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX942-NEXT:    v_perm_b32 v2, v3, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_perm_b32 v2, v4, v0, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v1 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v1
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v3, v1 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v4, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v3, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v4, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fminimum3_v2f16__inlineimm1:
@@ -2337,19 +2325,17 @@ define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) {
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v0, v4, s0
 ; GFX942-NEXT:    v_pk_min_f16 v1, v1, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v1, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v2, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2408,10 +2394,9 @@ define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2471,10 +2456,9 @@ define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2617,10 +2601,9 @@ define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2656,35 +2639,33 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) {
 ; GFX942-LABEL: v_fminimum3_v3f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v7, v1, 2.0
-; GFX942-NEXT:    s_mov_b32 s1, 0x5040100
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
+; GFX942-NEXT:    s_mov_b32 s1, 0x5040100
 ; GFX942-NEXT:    s_movk_i32 s0, 0x7e00
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX942-NEXT:    v_perm_b32 v4, v5, v0, s1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_perm_b32 v4, v6, v0, s1
 ; GFX942-NEXT:    v_pk_min_f16 v4, v4, v2
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v5, v7, vcc
 ; GFX942-NEXT:    v_pack_b32_f16 v7, v1, s0
 ; GFX942-NEXT:    v_pk_min_f16 v7, v7, v3
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v6, v0, s1
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fminimum3_v3f16__inlineimm1:
@@ -2743,10 +2724,9 @@ define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) {
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2785,37 +2765,33 @@ define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v5, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v6
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v1 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v4, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v8
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2854,37 +2830,33 @@ define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v5
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v5 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v4
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v8, v4
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2947,17 +2919,14 @@ define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v1, v12, v6, vcc
 ; GFX942-NEXT:    v_perm_b32 v2, v8, v1, s0
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v11
 ; GFX942-NEXT:    v_perm_b32 v6, v9, v0, s0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, |v5| src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v12, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v6, v6, v10
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v6
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v9, |v4| src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v7, v12, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v1, |v5|
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v1, v12, v2, vcc
@@ -3012,37 +2981,33 @@ define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v8, v7, v6, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v0, -v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v1, v3 neg_lo:[1,1] neg_hi:[1,1]
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, -v1, -v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, -v1, -v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v6, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v5 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v6, -v5
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, -v5 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v8, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, v4 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX942-NEXT:    v_cmp_o_f16_e64 vcc, v8, -v4
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, -v4 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v7, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3078,42 +3043,40 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) {
 ; GFX942-LABEL: v_fminimum3_v4f16__inlineimm1:
 ; GFX942:       ; %bb.0:
 ; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    v_mov_b32_e32 v6, 0x7e00
-; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX942-NEXT:    v_pk_min_f16 v4, v0, 2.0 op_sel_hi:[1,0]
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX942-NEXT:    v_pk_min_f16 v7, v1, 2.0 op_sel_hi:[1,0]
-; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v8, v6, v8, vcc
+; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v8, v5, v8, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX942-NEXT:    v_perm_b32 v4, v8, v1, s0
-; GFX942-NEXT:    v_pk_min_f16 v4, v4, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v8, v3 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    v_perm_b32 v8, v5, v0, s0
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX942-NEXT:    v_pk_min_f16 v8, v8, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v7, v6, v7, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 16, v8
-; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v5, v2 src0_sel:DWORD src1_sel:WORD_1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v9, vcc
+; GFX942-NEXT:    v_pk_min_f16 v4, v4, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v7, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v6, v2 src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_perm_b32 v6, v6, v0, s0
+; GFX942-NEXT:    v_pk_min_f16 v6, v6, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v8, v5, v6, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX942-NEXT:    v_perm_b32 v1, v7, v1, s0
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v8, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v8, v0, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_fminimum3_v4f16__inlineimm1:
@@ -3151,37 +3114,33 @@ define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) {
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    v_pk_min_f16 v2, v1, v3
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v1, v4, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v4, v4
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v1, v1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v2, v0, v6, s0
 ; GFX942-NEXT:    v_pk_min_f16 v2, v2, 4.0 op_sel_hi:[1,0]
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v6, v6
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v3, s0
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v0
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v5, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v4, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4005,19 +3964,17 @@ define <4 x half> @v_no_fminimum3_v2f16__multi_use(<2 x half> %a, <2 x half> %b,
 ; GFX942-NEXT:    s_mov_b32 s0, 0x5040100
 ; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v0, v1, v5, s0
 ; GFX942-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v5, v2
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v5, v4, v3, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX942-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v4, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX942-NEXT:    v_perm_b32 v1, v1, v5, s0
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index ff894d184e6c4..924378eb2376d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -204,14 +204,12 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1
 ; GFX9-LABEL: fneg_xor_select_v2i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
@@ -746,25 +744,23 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v1, v0, s4
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v3, v0, s6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: select_fneg_select_v2f16:
@@ -854,25 +850,23 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1,
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v4
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v3
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v4, v1, v0, s4
-; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    v_xor_b32_e32 v4, 0x80008000, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v5, vcc
+; GFX9-NEXT:    s_mov_b32 s6, 0x5040100
+; GFX9-NEXT:    v_perm_b32 v1, v3, v0, s6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s6
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: select_fneg_xor_select_v2i16:

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 4b9da7b49e997..6925a98f643b9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -345,11 +345,10 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec,
 ; GCN-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 6
-; GCN-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_sdwa v1, v0, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    v_mov_b32_e32 v2, s3
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT:    s_lshr_b32 s3, s2, 16
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GCN-NEXT:    s_cmp_lg_u32 s6, 5
 ; GCN-NEXT:    v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index b51cb9df8d784..47a371d8de07c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -1145,106 +1145,104 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s5, 1
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v8, v0, v6, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX900-LABEL: v_insertelement_v8bf16_dynamic:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX900-NEXT:    s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX900-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x10
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_mov_b32 s14, 0x5040100
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 6
-; GFX900-NEXT:    v_mov_b32_e32 v5, s4
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[18:19]
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 6
 ; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX900-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 7
+; GFX900-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 4
+; GFX900-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 5
+; GFX900-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 2
+; GFX900-NEXT:    v_mov_b32_e32 v5, s12
+; GFX900-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 3
+; GFX900-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 0
+; GFX900-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s13, 1
+; GFX900-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX900-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX900-NEXT:    v_perm_b32 v3, v3, v6, s2
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX900-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s2
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX900-NEXT:    v_perm_b32 v1, v6, v1, s2
-; GFX900-NEXT:    v_perm_b32 v0, v5, v0, s2
-; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v7, v2, v5, s[2:3]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v8, v1, v5, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v9, v0, v5, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    v_perm_b32 v3, v3, v6, s14
+; GFX900-NEXT:    v_perm_b32 v2, v2, v7, s14
+; GFX900-NEXT:    v_perm_b32 v1, v1, v8, s14
+; GFX900-NEXT:    v_perm_b32 v0, v0, v9, s14
+; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: v_insertelement_v8bf16_dynamic:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x10
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-NEXT:    s_mov_b32 s14, 0x5040100
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 6
-; GFX942-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v4, s[18:19]
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 6
 ; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 7
-; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 7
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 4
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 5
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s12
+; GFX942-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 3
+; GFX942-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 0
+; GFX942-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s13, 1
+; GFX942-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 4
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX942-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 2
-; GFX942-NEXT:    v_perm_b32 v3, v3, v6, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX942-NEXT:    v_perm_b32 v2, v6, v2, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX942-NEXT:    v_perm_b32 v1, v6, v1, s2
-; GFX942-NEXT:    v_perm_b32 v0, v5, v0, s2
-; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v2, v5, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v1, v5, s[6:7]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v0, v5, s[10:11]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_perm_b32 v3, v3, v6, s14
+; GFX942-NEXT:    v_perm_b32 v2, v2, v7, s14
+; GFX942-NEXT:    v_perm_b32 v1, v1, v8, s14
+; GFX942-NEXT:    v_perm_b32 v0, v0, v9, s14
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
 ; GFX942-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
@@ -1559,163 +1557,163 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ;
 ; GFX900-LABEL: v_insertelement_v16bf16_dynamic:
 ; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX900-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX900-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX900-NEXT:    s_load_dwordx2 s[28:29], s[8:9], 0x10
+; GFX900-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX900-NEXT:    s_mov_b32 s30, 0x5040100
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
-; GFX900-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 6
-; GFX900-NEXT:    v_mov_b32_e32 v9, s4
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v8, s[38:39]
+; GFX900-NEXT:    global_load_dwordx4 v[4:7], v8, s[38:39] offset:16
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 6
 ; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX900-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 7
+; GFX900-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 4
+; GFX900-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 5
+; GFX900-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 2
+; GFX900-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 3
+; GFX900-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 0
+; GFX900-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 1
+; GFX900-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 14
+; GFX900-NEXT:    v_mov_b32_e32 v9, s28
+; GFX900-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 15
+; GFX900-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 12
+; GFX900-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 13
+; GFX900-NEXT:    s_cselect_b64 s[20:21], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 10
+; GFX900-NEXT:    s_cselect_b64 s[22:23], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 11
+; GFX900-NEXT:    s_cselect_b64 s[24:25], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 8
+; GFX900-NEXT:    s_cselect_b64 s[26:27], -1, 0
+; GFX900-NEXT:    s_cmp_eq_u32 s29, 9
+; GFX900-NEXT:    s_cselect_b64 s[28:29], -1, 0
 ; GFX900-NEXT:    s_waitcnt vmcnt(1)
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v4, v9, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX900-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX900-NEXT:    v_perm_b32 v4, v4, v10, s2
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX900-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX900-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX900-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX900-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 14
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 15
+; GFX900-NEXT:    v_cndmask_b32_e32 v10, v3, v9, vcc
+; GFX900-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v2, v9, s[2:3]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX900-NEXT:    v_cndmask_b32_e64 v12, v1, v9, s[6:7]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX900-NEXT:    v_cndmask_b32_e64 v13, v0, v9, s[10:11]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[16:17]
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX900-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 12
-; GFX900-NEXT:    v_perm_b32 v1, v12, v1, s2
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 13
-; GFX900-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX900-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 10
-; GFX900-NEXT:    v_perm_b32 v8, v12, v8, s2
-; GFX900-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 11
-; GFX900-NEXT:    v_perm_b32 v3, v10, v3, s2
-; GFX900-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 8
-; GFX900-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    s_cmp_eq_u32 s5, 9
-; GFX900-NEXT:    v_perm_b32 v2, v11, v2, s2
-; GFX900-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX900-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX900-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX900-NEXT:    v_perm_b32 v7, v12, v7, s2
-; GFX900-NEXT:    v_perm_b32 v6, v10, v6, s2
-; GFX900-NEXT:    v_perm_b32 v5, v9, v5, s2
-; GFX900-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
-; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
+; GFX900-NEXT:    v_cndmask_b32_e64 v14, v7, v9, s[14:15]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[20:21]
+; GFX900-NEXT:    v_cndmask_b32_e64 v15, v6, v9, s[18:19]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[24:25]
+; GFX900-NEXT:    v_perm_b32 v3, v3, v10, s30
+; GFX900-NEXT:    v_cndmask_b32_e64 v10, v5, v9, s[22:23]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    s_mov_b64 vcc, s[28:29]
+; GFX900-NEXT:    v_perm_b32 v2, v2, v11, s30
+; GFX900-NEXT:    v_cndmask_b32_e64 v11, v4, v9, s[26:27]
+; GFX900-NEXT:    v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-NEXT:    v_perm_b32 v7, v7, v14, s30
+; GFX900-NEXT:    v_perm_b32 v6, v6, v15, s30
+; GFX900-NEXT:    v_perm_b32 v5, v5, v10, s30
+; GFX900-NEXT:    v_perm_b32 v4, v4, v11, s30
+; GFX900-NEXT:    v_perm_b32 v1, v1, v12, s30
+; GFX900-NEXT:    v_perm_b32 v0, v0, v13, s30
+; GFX900-NEXT:    global_store_dwordx4 v8, v[4:7], s[36:37] offset:16
+; GFX900-NEXT:    global_store_dwordx4 v8, v[0:3], s[36:37]
 ; GFX900-NEXT:    s_endpgm
 ;
 ; GFX942-LABEL: v_insertelement_v16bf16_dynamic:
 ; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx4 s[36:39], s[4:5], 0x0
+; GFX942-NEXT:    s_load_dwordx2 s[28:29], s[4:5], 0x10
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX942-NEXT:    s_mov_b32 s30, 0x5040100
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
-; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 6
-; GFX942-NEXT:    v_mov_b32_e32 v9, s6
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[38:39]
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[38:39] offset:16
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 6
 ; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 7
-; GFX942-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 7
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 4
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 5
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 2
+; GFX942-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 3
+; GFX942-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 0
+; GFX942-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 1
+; GFX942-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 14
+; GFX942-NEXT:    v_mov_b32_e32 v9, s28
+; GFX942-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 15
+; GFX942-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 12
+; GFX942-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 13
+; GFX942-NEXT:    s_cselect_b64 s[20:21], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 10
+; GFX942-NEXT:    s_cselect_b64 s[22:23], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 11
+; GFX942-NEXT:    s_cselect_b64 s[24:25], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 8
+; GFX942-NEXT:    s_cselect_b64 s[26:27], -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s29, 9
+; GFX942-NEXT:    s_cselect_b64 s[28:29], -1, 0
 ; GFX942-NEXT:    s_waitcnt vmcnt(1)
 ; GFX942-NEXT:    v_cndmask_b32_e32 v10, v3, v9, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 4
-; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 5
-; GFX942-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 2
-; GFX942-NEXT:    v_perm_b32 v3, v3, v10, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 3
-; GFX942-NEXT:    v_lshrrev_b32_e32 v12, 16, v1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 0
-; GFX942-NEXT:    v_perm_b32 v2, v10, v2, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v12, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 1
-; GFX942-NEXT:    v_lshrrev_b32_e32 v13, 16, v0
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 14
-; GFX942-NEXT:    v_perm_b32 v1, v10, v1, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v13, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 15
+; GFX942-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, v2, v9, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v1, v9, s[6:7]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v0, v9, s[10:11]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[16:17]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
-; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; GFX942-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 12
-; GFX942-NEXT:    v_perm_b32 v0, v10, v0, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v14, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 13
-; GFX942-NEXT:    v_lshrrev_b32_e32 v15, 16, v6
-; GFX942-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 10
-; GFX942-NEXT:    v_perm_b32 v7, v10, v7, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v15, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 11
-; GFX942-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
-; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 8
-; GFX942-NEXT:    v_perm_b32 v6, v10, v6, s2
-; GFX942-NEXT:    v_cndmask_b32_e32 v10, v16, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    s_cmp_eq_u32 s7, 9
-; GFX942-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
-; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v9, v17, v9, vcc
-; GFX942-NEXT:    v_perm_b32 v5, v10, v5, s2
-; GFX942-NEXT:    v_perm_b32 v4, v9, v4, s2
-; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
-; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v7, v9, s[14:15]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[20:21]
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v6, v9, s[18:19]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[24:25]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v5, v9, s[22:23]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[28:29]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v4, v9, s[26:27]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-NEXT:    v_perm_b32 v7, v7, v14, s30
+; GFX942-NEXT:    v_perm_b32 v6, v6, v15, s30
+; GFX942-NEXT:    v_perm_b32 v5, v5, v16, s30
+; GFX942-NEXT:    v_perm_b32 v4, v4, v17, s30
+; GFX942-NEXT:    v_perm_b32 v3, v3, v10, s30
+; GFX942-NEXT:    v_perm_b32 v2, v2, v11, s30
+; GFX942-NEXT:    v_perm_b32 v1, v1, v12, s30
+; GFX942-NEXT:    v_perm_b32 v0, v0, v13, s30
+; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[36:37] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[36:37]
 ; GFX942-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index e11900ac0ca68..e0dacb7a59a42 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2694,47 +2694,47 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
 ; GFX9-LABEL: v_insertelement_v8f16_dynamic:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX9-NEXT:    s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[8:9], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX9-NEXT:    s_mov_b32 s14, 0x5040100
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 6
-; GFX9-NEXT:    v_mov_b32_e32 v5, s4
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[18:19]
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 6
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 7
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 4
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 5
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 2
+; GFX9-NEXT:    v_mov_b32_e32 v5, s12
+; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 3
+; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s13, 1
+; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v3, v5, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX9-NEXT:    v_perm_b32 v3, v3, v6, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v7, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX9-NEXT:    v_perm_b32 v2, v6, v2, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v5, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s2
-; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s2
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v3, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v2, v5, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v2, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v5, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v0, v5, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v0, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v3, v3, v6, s14
+; GFX9-NEXT:    v_perm_b32 v2, v2, v7, s14
+; GFX9-NEXT:    v_perm_b32 v1, v1, v8, s14
+; GFX9-NEXT:    v_perm_b32 v0, v0, v9, s14
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v8f16_dynamic:
@@ -2783,14 +2783,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v9, v6, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
 ; VI-NEXT:    s_cmp_eq_u32 s5, 1
-; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; VI-NEXT:    v_cndmask_b32_e32 v8, v0, v6, vcc
 ; VI-NEXT:    s_cselect_b64 vcc, -1, 0
-; VI-NEXT:    v_cndmask_b32_e32 v6, v10, v6, vcc
 ; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT:    v_cndmask_b32_sdwa v0, v0, v6, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_sdwa v0, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
 ;
@@ -3196,82 +3194,82 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %val, i32 %n) {
 ; GFX9-LABEL: v_insertelement_v16f16_dynamic:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
+; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX9-NEXT:    s_load_dwordx2 s[28:29], s[8:9], 0x10
+; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX9-NEXT:    s_mov_b32 s30, 0x5040100
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
-; GFX9-NEXT:    global_load_dwordx4 v[5:8], v0, s[2:3] offset:16
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 6
-; GFX9-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[38:39]
+; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[38:39] offset:16
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 6
 ; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 7
-; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 7
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 4
+; GFX9-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 5
+; GFX9-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 2
+; GFX9-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 3
+; GFX9-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 0
+; GFX9-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 1
+; GFX9-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 14
+; GFX9-NEXT:    v_mov_b32_e32 v9, s28
+; GFX9-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 15
+; GFX9-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 12
+; GFX9-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 13
+; GFX9-NEXT:    s_cselect_b64 s[20:21], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 10
+; GFX9-NEXT:    s_cselect_b64 s[22:23], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 11
+; GFX9-NEXT:    s_cselect_b64 s[24:25], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 8
+; GFX9-NEXT:    s_cselect_b64 s[26:27], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s29, 9
+; GFX9-NEXT:    s_cselect_b64 s[28:29], -1, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v4, v9, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 5
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
-; GFX9-NEXT:    v_perm_b32 v4, v4, v10, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v12, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 14
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v13, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 15
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v3, v9, vcc
+; GFX9-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v3, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v2, v9, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v2, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v12, v1, v9, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v1, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v0, v9, s[10:11]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v0, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[16:17]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 12
-; GFX9-NEXT:    v_perm_b32 v1, v12, v1, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v14, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 13
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 10
-; GFX9-NEXT:    v_perm_b32 v8, v12, v8, s2
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, v15, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 11
-; GFX9-NEXT:    v_perm_b32 v3, v10, v3, s2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 8
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    s_cmp_eq_u32 s5, 9
-; GFX9-NEXT:    v_perm_b32 v2, v11, v2, s2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 16, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
-; GFX9-NEXT:    s_cselect_b64 vcc, -1, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v7, v12, v7, s2
-; GFX9-NEXT:    v_perm_b32 v6, v10, v6, s2
-; GFX9-NEXT:    v_perm_b32 v5, v9, v5, s2
-; GFX9-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1] offset:16
-; GFX9-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, v7, v9, s[14:15]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v7, v7, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[20:21]
+; GFX9-NEXT:    v_cndmask_b32_e64 v15, v6, v9, s[18:19]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v6, v6, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[24:25]
+; GFX9-NEXT:    v_perm_b32 v3, v3, v10, s30
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, v5, v9, s[22:23]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v5, v5, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    s_mov_b64 vcc, s[28:29]
+; GFX9-NEXT:    v_perm_b32 v2, v2, v11, s30
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, v4, v9, s[26:27]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v4, v9, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v7, v7, v14, s30
+; GFX9-NEXT:    v_perm_b32 v6, v6, v15, s30
+; GFX9-NEXT:    v_perm_b32 v5, v5, v10, s30
+; GFX9-NEXT:    v_perm_b32 v4, v4, v11, s30
+; GFX9-NEXT:    v_perm_b32 v1, v1, v12, s30
+; GFX9-NEXT:    v_perm_b32 v0, v0, v13, s30
+; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[36:37] offset:16
+; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[36:37]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; VI-LABEL: v_insertelement_v16f16_dynamic:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 362b9dacaf257..17fdc841a1258 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -666,14 +666,13 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; GFX8-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v1
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -684,9 +683,8 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -701,12 +699,12 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v2, s4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v2f16:
@@ -836,14 +834,13 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; GFX8-NEXT:    v_max_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_max_f16_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v4, v0, v1
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -854,9 +851,8 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -871,12 +867,12 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v2, v0, v1
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v2, s4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v2f16__nsz:
@@ -1014,15 +1010,14 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX8-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s16, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_max_f16_e32 v1, s5, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v1, s17
-; GFX8-NEXT:    v_max_f16_e32 v3, s16, v1
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT:    v_max_f16_e32 v0, s5, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v3, s16, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
@@ -1041,9 +1036,8 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX900-NEXT:    s_lshr_b32 s5, s16, 16
 ; GFX900-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -1065,16 +1059,16 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX10-LABEL: s_maximum_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v0, s16, s17
-; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s16, s17
 ; GFX10-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s16, 16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_pk_max_f16 v0, s16, s17
 ; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7e00, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v0
 ; GFX10-NEXT:    ;;#ASMEND
@@ -1172,18 +1166,17 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_max_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v3f16:
@@ -1196,9 +1189,8 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1214,15 +1206,15 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v0, v2
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v3f16:
@@ -1379,18 +1371,17 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_max_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v3f16__nsz:
@@ -1403,9 +1394,8 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1421,15 +1411,15 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_max_f16 v4, v0, v2
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pk_max_f16 v2, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v3f16__nsz:
@@ -1593,25 +1583,24 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v8, v6, v5
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v8, v7, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT:    v_max_f16_e32 v7, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v4f16:
@@ -1621,15 +1610,13 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
@@ -1645,20 +1632,20 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX10-LABEL: v_maximum_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_pk_max_f16 v6, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e64 s5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v4f16:
@@ -1836,25 +1823,24 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_max_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_max_f16_e32 v8, v6, v5
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX8-NEXT:    v_max_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_max_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_max_f16_e32 v8, v7, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT:    v_max_f16_e32 v7, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX8-NEXT:    v_max_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_maximum_v4f16__nsz:
@@ -1864,15 +1850,13 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_pk_max_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
@@ -1888,20 +1872,20 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX10-LABEL: v_maximum_v4f16__nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_max_f16 v4, v1, v3
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_pk_max_f16 v6, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v5, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e64 s5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v4f16__nsz:
@@ -2195,34 +2179,34 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX10-LABEL: v_maximum_v8f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_pk_max_f16 v8, v3, v7
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX10-NEXT:    v_pk_max_f16 v9, v2, v6
-; GFX10-NEXT:    v_pk_max_f16 v12, v1, v5
-; GFX10-NEXT:    v_pk_max_f16 v13, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX10-NEXT:    v_pk_max_f16 v10, v2, v6
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v12, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_max_f16 v10, v1, v5
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s5
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_perm_b32 v1, v13, v1, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_maximum_v8f16:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
index f6d37b34807b1..b8e5be785a77d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll
@@ -556,14 +556,13 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; GFX8-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_min_f16_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v4, v0, v1
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -574,9 +573,8 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -591,12 +589,12 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v2, s4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v2f16:
@@ -691,14 +689,13 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v4, v3, v2
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v3, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
-; GFX8-NEXT:    v_min_f16_e32 v3, v0, v1
+; GFX8-NEXT:    v_min_f16_e32 v2, v3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v2, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v4, v0, v1
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -709,9 +706,8 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v3, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v3, v2, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -726,12 +722,12 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_f16 v2, v0, v1
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v1 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v3, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v2, s4
+; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v2f16__nsz:
@@ -827,15 +823,14 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX8-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX8-NEXT:    s_lshr_b32 s5, s16, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_min_f16_e32 v1, s5, v0
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s5, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_mov_b32_e32 v1, s17
-; GFX8-NEXT:    v_min_f16_e32 v3, s16, v1
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s16, v1
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX8-NEXT:    v_min_f16_e32 v0, s5, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX8-NEXT:    v_mov_b32_e32 v2, s17
+; GFX8-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v3, s16, v2
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, s16, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    ;;#ASMSTART
 ; GFX8-NEXT:    ; use v0
@@ -854,9 +849,8 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX900-NEXT:    s_lshr_b32 s5, s16, 16
 ; GFX900-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX900-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, s5, v3
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX900-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
 ; GFX900-NEXT:    ;;#ASMSTART
@@ -878,16 +872,16 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) {
 ; GFX10-LABEL: s_minimum_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_min_f16 v0, s16, s17
-; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s16, s17
 ; GFX10-NEXT:    s_lshr_b32 s4, s17, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s16, 16
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo
+; GFX10-NEXT:    v_pk_min_f16 v0, s16, s17
 ; GFX10-NEXT:    v_cmp_o_f16_e64 vcc_lo, s5, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo
-; GFX10-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, s16, s17
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7e00
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0x7e00, v0, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v0, v1, v0, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX10-NEXT:    ;;#ASMSTART
 ; GFX10-NEXT:    ; use v0
 ; GFX10-NEXT:    ;;#ASMEND
@@ -958,18 +952,17 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_min_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v3f16:
@@ -982,9 +975,8 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1000,15 +992,15 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_f16 v4, v0, v2
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v3f16:
@@ -1118,18 +1110,17 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_min_f16_e32 v5, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
 ; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v3f16__nsz:
@@ -1142,9 +1133,8 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
@@ -1160,15 +1150,15 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_min_f16 v4, v0, v2
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pk_min_f16 v2, v1, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX10-NEXT:    v_cmp_o_f16_e64 s4, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v5, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v4, s4
+; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_perm_b32 v0, v0, v4, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v3f16__nsz:
@@ -1278,25 +1268,24 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v8, v6, v5
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v8, v7, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT:    v_min_f16_e32 v7, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v4f16:
@@ -1306,15 +1295,13 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
@@ -1330,20 +1317,20 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) {
 ; GFX10-LABEL: v_minimum_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_pk_min_f16 v6, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e64 s5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v4f16:
@@ -1462,25 +1449,24 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT:    v_min_f16_e32 v6, v5, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0x7e00
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v5, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v7, v6, vcc
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT:    v_min_f16_e32 v8, v6, v5
-; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v6, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX8-NEXT:    v_min_f16_e32 v6, v1, v3
+; GFX8-NEXT:    v_min_f16_e32 v4, v5, v4
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x7e00
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX8-NEXT:    v_cndmask_b32_sdwa v4, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_min_f16_e32 v8, v7, v6
+; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v7, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX8-NEXT:    v_min_f16_e32 v7, v1, v3
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v7, vcc
 ; GFX8-NEXT:    v_min_f16_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_minimum_v4f16__nsz:
@@ -1490,15 +1476,13 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX900-NEXT:    v_mov_b32_e32 v5, 0x7e00
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v1, v3
 ; GFX900-NEXT:    v_cndmask_b32_e32 v6, v5, v4, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    v_pk_min_f16 v3, v0, v2
 ; GFX900-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
 ; GFX900-NEXT:    v_cndmask_b32_e32 v4, v5, v3, vcc
-; GFX900-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX900-NEXT:    v_cmp_o_f16_sdwa vcc, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX900-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX900-NEXT:    v_cndmask_b32_sdwa v0, v5, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX900-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX900-NEXT:    v_perm_b32 v0, v0, v4, s4
 ; GFX900-NEXT:    v_perm_b32 v1, v1, v6, s4
@@ -1514,20 +1498,20 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) {
 ; GFX10-LABEL: v_minimum_v4f16__nsz:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_min_f16 v4, v1, v3
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
-; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, 0x7e00, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7e00
 ; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v3 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v5, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_pk_min_f16 v6, v1, v3
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v2 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v5, v0, v2
+; GFX10-NEXT:    v_cmp_o_f16_e64 s5, v0, v2
+; GFX10-NEXT:    v_cndmask_b32_sdwa v2, v4, v6, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v4, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0x7e00, v5, s5
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v6, vcc_lo
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v1, v2, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz:
@@ -1734,34 +1718,34 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) {
 ; GFX10-LABEL: v_minimum_v8f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    v_pk_min_f16 v8, v3, v7
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
-; GFX10-NEXT:    v_pk_min_f16 v9, v2, v6
-; GFX10-NEXT:    v_pk_min_f16 v12, v1, v5
-; GFX10-NEXT:    v_pk_min_f16 v13, v0, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0x7e00, v8, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0x7e00
+; GFX10-NEXT:    v_pk_min_f16 v10, v2, v6
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s5, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v12, v0, v4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v11, v9, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v2, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
-; GFX10-NEXT:    v_cndmask_b32_e32 v9, 0x7e00, v9, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v2, v6 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v13
-; GFX10-NEXT:    v_perm_b32 v2, v2, v9, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cmp_o_f16_sdwa s4, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_cndmask_b32_sdwa v6, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_pk_min_f16 v10, v1, v5
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s5
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_sdwa v13, v9, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0x7e00, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v0, v4 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v1, v5 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v0, v0, v13, 0x5040100
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_o_f16_sdwa vcc_lo, v3, v7 src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_perm_b32 v1, v1, v6, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v12, vcc_lo
+; GFX10-NEXT:    s_mov_b32 vcc_lo, s4
+; GFX10-NEXT:    v_cndmask_b32_sdwa v4, v9, v12, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v1, v5
+; GFX10-NEXT:    v_perm_b32 v0, v4, v0, 0x5040100
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0x7e00, v10, vcc_lo
+; GFX10-NEXT:    v_cmp_o_f16_e32 vcc_lo, v3, v7
+; GFX10-NEXT:    v_perm_b32 v1, v13, v1, 0x5040100
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo
-; GFX10-NEXT:    v_perm_b32 v3, v3, v10, 0x5040100
+; GFX10-NEXT:    v_perm_b32 v3, v11, v3, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_minimum_v8f16:

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
new file mode 100644
index 0000000000000..4b45c54a3b83d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave32.mir
@@ -0,0 +1,232 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=amdgcn -mcpu=gfx1030 -run-pass=si-peephole-sdwa -o - | FileCheck %s
+
+---
+name: cndmask_b32 # can be directly converted to SDWA without a copy to VCC
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vcc
+
+    ; CHECK-LABEL: name: cndmask_b32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vcc
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr1
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+# For SDWA conversion of V_CNDMASK, the carry-in operand must be
+# available in VCC_LO.  This is achieved by introducing a COPY
+# instruction. Comparison instructions could be changed to VOP2 form
+# intead, but we prefer to use a COPY.
+
+---
+name: carry-compare
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-compare
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec
+    ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_EQ_U32_e64_]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 undef %0, 1, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name: carry-compare-class
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-compare-class
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec
+    ; CHECK-NEXT: $vcc_lo = COPY killed [[V_CMP_CLASS_F32_e64_]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, undef [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32_xm0_xexec = V_CMP_CLASS_F32_e64 2, undef %0, 1, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: carry-non-compare
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-non-compare
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec
+    ; CHECK-NEXT: $vcc_lo = COPY killed undef [[DEF]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed undef %0, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: carry-multiuse
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: carry-multiuse
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec
+    ; CHECK-NEXT: $vcc_lo = COPY undef [[DEF]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: $vgpr1 = COPY [[DEF]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, undef %0, implicit $exec
+    $vgpr0 = COPY %4
+    $vgpr1 = COPY %0
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: live-vcc # cannot convert because of live VCC
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: live-vcc
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec
+    ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, undef [[DEF1]], implicit-def $vcc_lo, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc_lo, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec
+    %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    V_CMP_EQ_U32_e32 1, undef %2, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec
+    %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec
+    $vgpr0 = COPY %6
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: cannot-shrink-with-source-mods
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: cannot-shrink-with-source-mods
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[DEF]], 0, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 %0, 0, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: missing-carry-def
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_32_xm0_xexec }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: missing-carry-def
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_32_xm0_xexec, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %1:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, %1:sreg_32_xm0_xexec, implicit $exec
+    %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_32_xm0_xexec, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN implicit $vgpr0
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir
new file mode 100644
index 0000000000000..e243df4077ff4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole-cndmask-wave64.mir
@@ -0,0 +1,233 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc %s -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - | FileCheck %s
+
+---
+name: cndmask_b32 # can be directly converted to SDWA without a copy to VCC
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vcc
+
+    ; CHECK-LABEL: name: cndmask_b32
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vcc
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY]], implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[COPY1]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[COPY]], 0, [[COPY1]], 0, 6, 0, 5, 5, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = COPY $vgpr1
+    %1:vgpr_32 = COPY $vgpr0
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, %0, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e32 killed %2, killed %3, implicit $exec, implicit $vcc
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+
+...
+
+# For SDWA conversion of V_CNDMASK, the carry-in operand must be
+# available in VCC.  This is achieved by introducing a COPY
+# instruction. Comparison instructions could be changed to VOP2 form
+# instead, but we prefer to use a COPY.
+
+---
+name: carry-compare
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-compare
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec
+    ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_EQ_U32_e64_]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+---
+name: carry-compare-class
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-compare-class
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_CLASS_F32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_CLASS_F32_e64 2, [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec
+    ; CHECK-NEXT: $vcc = COPY killed [[V_CMP_CLASS_F32_e64_]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[DEF1]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_32_xm0_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_CLASS_F32_e64 2,%0, 1, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: carry-non-compare
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: carry-non-compare
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, [[DEF1]], 8, [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec
+    ; CHECK-NEXT: $vcc = COPY killed [[DEF]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_PK_MAX_F16 8, %1, 8, %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, killed %0, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: carry-multiuse
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+
+    ; CHECK-LABEL: name: carry-multiuse
+    ; CHECK: liveins: $vgpr0, $vgpr1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_PK_MAX_F16_:%[0-9]+]]:vgpr_32 = V_PK_MAX_F16 8, undef [[DEF1]], 8, undef [[DEF1]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_PK_MAX_F16_]], implicit $exec
+    ; CHECK-NEXT: $vcc = COPY [[DEF]]
+    ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_sdwa:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_sdwa 0, [[V_MOV_B32_e32_]], 0, [[V_PK_MAX_F16_]], 0, 6, 0, 6, 5, implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_sdwa]]
+    ; CHECK-NEXT: $vgpr1 = COPY [[DEF]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_PK_MAX_F16 8, undef %1, 8, undef %1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %3, %0, implicit $exec
+    $vgpr0 = COPY %4
+    $vgpr1 = COPY %0
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: live-vcc # cannot convert because of live VCC
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: live-vcc
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[DEF]], 1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[DEF1]], implicit $exec
+    ; CHECK-NEXT: V_CMP_EQ_U32_e32 1, [[DEF1]], implicit-def $vcc, implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], [[V_CMP_EQ_U32_e64_]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e32_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e32 killed [[V_LSHRREV_B32_e64_]], killed [[V_LSHRREV_B32_e64_]], implicit $vcc, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:sreg_64_xexec = V_CMP_EQ_U32_e64 %0, 1, implicit $exec
+    %5:vgpr_32 = V_LSHRREV_B32_e64 16, %2, implicit $exec
+    V_CMP_EQ_U32_e32 1, %2, implicit-def $vcc, implicit $exec
+    %6:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %5, %3, implicit $exec
+    %7:vgpr_32 = V_CNDMASK_B32_e32 killed %5, killed %5, implicit $vcc, implicit $exec
+    $vgpr0 = COPY %6
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: cannot-shrink-with-source-mods
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: cannot-shrink-with-source-mods
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 undef [[DEF]], 0, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF1]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, [[V_LSHRREV_B32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %0, 0, implicit $exec
+    %3:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec
+    %4:vgpr_32 = V_CNDMASK_B32_e64 1, 0, 0, %3, killed %2, implicit $exec
+    $vgpr0 = COPY %4
+    SI_RETURN implicit $vgpr0
+...
+
+...
+---
+name: missing-carry-def
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64_xexec }
+body:             |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: missing-carry-def
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, undef [[DEF]], implicit $exec
+    ; CHECK-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_LSHRREV_B32_e64_]], undef %0:sreg_64_xexec, implicit $exec
+    ; CHECK-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
+    ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_LSHRREV_B32_e64 16, undef %1, implicit $exec
+    %3:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, %2, undef %0:sreg_64_xexec, implicit $exec
+    $vgpr0 = COPY %3
+    SI_RETURN implicit $vgpr0
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index a2e1ae83a6e5f..7ed27f008083e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -38,14 +38,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -54,14 +52,12 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
@@ -190,18 +186,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x
 ; GFX9-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v5
-; GFX9-NEXT:    v_pk_add_f16 v1, v2, v4
+; GFX9-NEXT:    v_pk_add_f16 v1, v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16:
@@ -309,35 +303,29 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1
 ; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
-; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NEXT:    v_add_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v1, v2
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16:
@@ -469,18 +457,16 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x
 ; GFX9-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_add_f16 v1, v3, v5
+; GFX9-NEXT:    v_pk_add_f16 v1, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16:
@@ -588,14 +574,12 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-LABEL: add_select_fabs_var_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; VI-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -603,15 +587,13 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-LABEL: add_select_fabs_var_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -702,13 +684,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fabs_negk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0xbc00
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -717,13 +698,12 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fabs_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xbc00
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1039,9 +1019,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1054,9 +1033,8 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1152,9 +1130,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xe400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1167,9 +1144,8 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1260,13 +1236,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fabs_posk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1275,13 +1250,12 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fabs_posk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1376,9 +1350,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1391,9 +1364,8 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v3
@@ -1488,10 +1460,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1502,10 +1472,8 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
@@ -1605,16 +1573,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_sub_f16_sdwa v1, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_sdwa v1, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_sub_f16_e32 v2, v5, v2
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -1622,10 +1588,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x
 ; GFX9-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
@@ -1729,33 +1693,27 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1
 ; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_xor_b32_e32 v5, 0x80008000, v2
-; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_sub_f16_sdwa v2, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_mov_b32_e32 v1, v5
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v5, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_xor_b32_e32 v5, 0x80008000, v2
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v5, v0, s4
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16:
@@ -1856,16 +1814,14 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_sub_f16_sdwa v1, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_sub_f16_e32 v2, v5, v3
 ; VI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -1873,10 +1829,8 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x
 ; GFX9-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
@@ -1982,14 +1936,12 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-LABEL: add_select_fneg_var_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; VI-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; VI-NEXT:    v_add_f16_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -1997,15 +1949,13 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-LABEL: add_select_fneg_var_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v3, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2096,13 +2046,12 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fneg_negk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2110,14 +2059,13 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fneg_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2201,13 +2149,12 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: add_select_fneg_inv2pi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0xb118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0xb118
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2215,14 +2162,13 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: add_select_fneg_inv2pi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xb118
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xb118
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2306,13 +2252,12 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0x3118
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3118
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2320,14 +2265,13 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x,
 ; GFX9-LABEL: add_select_fneg_neginv2pi_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3118
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3118
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2725,13 +2669,12 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_negk_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2739,14 +2682,13 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_negk_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2829,13 +2771,12 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_fneg_posk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2843,14 +2784,13 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_fneg_posk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2933,13 +2873,12 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; VI-LABEL: add_select_posk_fneg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; VI-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; VI-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v3, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -2947,14 +2886,13 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x
 ; GFX9-LABEL: add_select_posk_fneg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT:    v_cndmask_b32_sdwa v4, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v4, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3045,14 +2983,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: add_select_negfabs_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3061,14 +2997,12 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: add_select_negfabs_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
@@ -3177,14 +3111,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: add_select_fabs_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3193,14 +3125,12 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: add_select_fabs_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_or_b32_e32 v3, 0x80008000, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
@@ -3309,14 +3239,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-LABEL: add_select_neg_fabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3325,14 +3253,12 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-LABEL: add_select_neg_fabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
@@ -3440,14 +3366,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; VI-LABEL: add_select_fabs_neg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_xor_b32_e32 v2, 0x80008000, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_add_f16_e32 v0, v0, v4
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3456,14 +3380,12 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h
 ; GFX9-LABEL: add_select_fabs_neg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_xor_b32_e32 v3, 0x80008000, v3
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_xor_b32_e32 v2, 0x80008000, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v4
@@ -3567,14 +3489,12 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: add_select_neg_negfabs_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v3
+; VI-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -3582,15 +3502,13 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: add_select_neg_negfabs_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v3
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v3, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3687,14 +3605,12 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: add_select_negfabs_neg_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; VI-NEXT:    v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT:    v_sub_f16_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; VI-NEXT:    v_sub_f16_sdwa v1, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_sub_f16_e32 v0, v4, v0
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -3702,15 +3618,13 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: add_select_negfabs_neg_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX9-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v2
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
-; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT:    v_perm_b32 v0, v2, v0, s4
 ; GFX9-NEXT:    v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3807,13 +3721,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_negfabs_posk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3822,13 +3735,12 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_negfabs_posk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4400
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -3928,9 +3840,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3943,9 +3854,8 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -4041,13 +3951,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-LABEL: mul_select_negfabs_negk_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; VI-NEXT:    v_mov_b32_e32 v2, 0xc400
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4056,13 +3965,12 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-LABEL: mul_select_negfabs_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 0x80008000, v2
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_or_b32_e32 v1, 0x80008000, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc400
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -4162,9 +4070,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; VI-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_mul_f16_e32 v0, v0, v3
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -4177,9 +4084,8 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc400
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v4, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
@@ -4279,32 +4185,29 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; VI-SAFE:       ; %bb.0:
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4400
-; VI-SAFE-NEXT:    v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-SAFE-NEXT:    v_add_f16_e32 v2, 4.0, v2
-; VI-SAFE-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4400
+; VI-SAFE-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SAFE-NEXT:    v_add_f16_e32 v2, 4.0, v2
+; VI-SAFE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SAFE-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0]
-; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-SAFE-NEXT:    v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0]
+; GFX9-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -4355,28 +4258,26 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; VI-NSZ:       ; %bb.0:
 ; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0xc400
-; VI-NSZ-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NSZ-NEXT:    v_sub_f16_e32 v2, -4.0, v2
-; VI-NSZ-NEXT:    v_mov_b32_e32 v4, 0x4000
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NSZ-NEXT:    v_mov_b32_e32 v1, 0xc400
+; VI-NSZ-NEXT:    v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NSZ-NEXT:    v_sub_f16_e32 v2, -4.0, v2
+; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_add_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
 ; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NSZ-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NSZ-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NSZ-NEXT:    v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
@@ -4439,32 +4340,29 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; VI-SAFE:       ; %bb.0:
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0xc400
-; VI-SAFE-NEXT:    v_add_f16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-SAFE-NEXT:    v_add_f16_e32 v2, -4.0, v2
-; VI-SAFE-NEXT:    v_or_b32_e32 v2, v2, v3
-; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v1, 0xc400
+; VI-SAFE-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SAFE-NEXT:    v_add_f16_e32 v2, -4.0, v2
+; VI-SAFE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SAFE-NEXT:    v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
-; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-SAFE-NEXT:    v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0]
+; GFX9-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -4515,28 +4413,26 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; VI-NSZ:       ; %bb.0:
 ; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4400
-; VI-NSZ-NEXT:    v_sub_f16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NSZ-NEXT:    v_sub_f16_e32 v2, 4.0, v2
-; VI-NSZ-NEXT:    v_mov_b32_e32 v4, 0x4000
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NSZ-NEXT:    v_mov_b32_e32 v1, 0x4400
+; VI-NSZ-NEXT:    v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NSZ-NEXT:    v_sub_f16_e32 v2, 4.0, v2
+; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_sub_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
 ; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NSZ-NEXT:    v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NSZ-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NSZ-NEXT:    v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
@@ -4591,28 +4487,26 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; VI-LABEL: select_fneg_posk_src_mul_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, 0xc400
-; VI-NEXT:    v_mul_f16_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_mul_f16_e32 v2, -4.0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x4000
-; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_mov_b32_e32 v1, 0xc400
+; VI-NEXT:    v_mul_f16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_mul_f16_e32 v2, -4.0, v2
+; VI-NEXT:    v_mov_b32_e32 v3, 0x4000
+; VI-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: select_fneg_posk_src_mul_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NEXT:    v_pk_mul_f16 v1, v2, -4.0 op_sel_hi:[1,0]
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -4704,34 +4598,31 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-SAFE-LABEL: select_fneg_posk_src_fma_v2f16:
 ; VI-SAFE:       ; %bb.0:
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-SAFE-NEXT:    v_fma_f16 v4, v5, 4.0, v4
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; VI-SAFE-NEXT:    v_fma_f16 v2, v2, 4.0, v3
-; VI-SAFE-NEXT:    v_or_b32_e32 v2, v2, v4
-; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; VI-SAFE-NEXT:    v_fma_f16 v1, v4, 4.0, v1
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-SAFE-NEXT:    v_fma_f16 v2, v2, 4.0, v3
+; VI-SAFE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
-; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-SAFE-NEXT:    v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1]
+; GFX9-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -4767,29 +4658,27 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, <
 ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
 ; VI-NSZ:       ; %bb.0:
 ; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; VI-NSZ-NEXT:    v_fma_f16 v1, v4, -4.0, -v1
 ; VI-NSZ-NEXT:    v_fma_f16 v2, v2, -4.0, -v3
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NSZ-NEXT:    v_fma_f16 v4, v5, -4.0, -v4
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_fma_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
 ; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NSZ-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NSZ-NEXT:    v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX9-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]
@@ -4858,34 +4747,31 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; VI-SAFE:       ; %bb.0:
 ; VI-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; VI-SAFE-NEXT:    v_fma_f16 v4, v5, 4.0, v4
-; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; VI-SAFE-NEXT:    v_fma_f16 v2, v2, 4.0, v3
-; VI-SAFE-NEXT:    v_or_b32_e32 v2, v2, v4
-; VI-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; VI-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; VI-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; VI-SAFE-NEXT:    v_fma_f16 v1, v4, 4.0, v1
 ; VI-SAFE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-SAFE-NEXT:    v_fma_f16 v2, v2, 4.0, v3
+; VI-SAFE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; VI-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; VI-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; VI-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-SAFE-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-SAFE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX9-SAFE:       ; %bb.0:
 ; GFX9-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SAFE-NEXT:    v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1]
-; GFX9-SAFE-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
-; GFX9-SAFE-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-SAFE-NEXT:    v_pk_fma_f16 v1, v2, 4.0, v3 op_sel_hi:[1,0,1]
+; GFX9-SAFE-NEXT:    v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-SAFE-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-SAFE-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-SAFE-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-SAFE-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-SAFE-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-SAFE-NEXT:    s_setpc_b64 s[30:31]
@@ -4942,29 +4828,27 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x,
 ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; VI-NSZ:       ; %bb.0:
 ; VI-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; VI-NSZ-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; VI-NSZ-NEXT:    v_fma_f16 v1, v4, -4.0, -v1
 ; VI-NSZ-NEXT:    v_fma_f16 v2, v2, -4.0, -v3
 ; VI-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NSZ-NEXT:    v_fma_f16 v4, v5, -4.0, -v4
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NSZ-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; VI-NSZ-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NSZ-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NSZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16:
 ; GFX9-NSZ:       ; %bb.0:
 ; GFX9-NSZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NSZ-NEXT:    v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GFX9-NSZ-NEXT:    v_mov_b32_e32 v3, 0x4000
-; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NSZ-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NSZ-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-NSZ-NEXT:    v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX9-NSZ-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX9-NSZ-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NSZ-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[4:5]
+; GFX9-NSZ-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NSZ-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NSZ-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX9-NSZ-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 7339b545686f5..21719226710de 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -806,45 +806,41 @@ define amdgpu_kernel void @select_v2f16(
 ; VI-LABEL: select_v2f16:
 ; VI:       ; %bb.0: ; %entry
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x44
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s6, s2
-; VI-NEXT:    s_mov_b32 s7, s3
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s22, s6
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s16, s10
-; VI-NEXT:    s_mov_b32 s17, s11
-; VI-NEXT:    s_mov_b32 s18, s2
-; VI-NEXT:    s_mov_b32 s19, s3
 ; VI-NEXT:    s_mov_b32 s20, s12
 ; VI-NEXT:    s_mov_b32 s21, s13
-; VI-NEXT:    s_mov_b32 s22, s2
-; VI-NEXT:    s_mov_b32 s23, s3
+; VI-NEXT:    s_mov_b32 s23, s7
+; VI-NEXT:    s_mov_b32 s16, s10
+; VI-NEXT:    s_mov_b32 s17, s11
+; VI-NEXT:    s_mov_b32 s18, s6
+; VI-NEXT:    s_mov_b32 s19, s7
+; VI-NEXT:    buffer_load_dword v0, off, s[20:23], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-NEXT:    s_mov_b32 s2, s6
+; VI-NEXT:    s_mov_b32 s3, s7
 ; VI-NEXT:    s_mov_b32 s12, s14
 ; VI-NEXT:    s_mov_b32 s13, s15
-; VI-NEXT:    s_mov_b32 s14, s2
-; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
-; VI-NEXT:    buffer_load_dword v1, off, s[20:23], 0
-; VI-NEXT:    buffer_load_dword v2, off, s[16:19], 0
-; VI-NEXT:    s_mov_b32 s15, s3
+; VI-NEXT:    s_mov_b32 s14, s6
+; VI-NEXT:    s_mov_b32 s15, s7
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], 0
 ; VI-NEXT:    buffer_load_dword v3, off, s[12:15], 0
-; VI-NEXT:    s_mov_b32 s0, s8
-; VI-NEXT:    s_mov_b32 s1, s9
+; VI-NEXT:    s_mov_b32 s4, s8
+; VI-NEXT:    s_mov_b32 s5, s9
 ; VI-NEXT:    s_waitcnt vmcnt(3)
 ; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v2, v1
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v5, v4
+; VI-NEXT:    v_cmp_lt_f16_e64 s[0:1], v1, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v3, s[0:1]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: select_v2f16:
@@ -1012,6 +1008,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s18, s10
@@ -1020,7 +1017,6 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
 ; VI-NEXT:    s_movk_i32 s2, 0x3900
@@ -1028,14 +1024,11 @@ define amdgpu_kernel void @select_v2f16_imm_a(
 ; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, 0.5, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, s2, v3
-; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cmp_lt_f16_e64 s[0:1], 0.5, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
@@ -1191,6 +1184,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s12, s2
 ; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; VI-NEXT:    s_mov_b32 s16, s4
 ; VI-NEXT:    s_mov_b32 s17, s5
 ; VI-NEXT:    s_mov_b32 s18, s10
@@ -1199,7 +1193,6 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    s_mov_b32 s5, s7
 ; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    s_mov_b32 s7, s11
-; VI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
 ; VI-NEXT:    s_movk_i32 s2, 0x3900
@@ -1207,14 +1200,11 @@ define amdgpu_kernel void @select_v2f16_imm_b(
 ; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
 ; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; VI-NEXT:    v_cmp_gt_f16_e32 vcc, 0.5, v0
-; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
 ; VI-NEXT:    v_cmp_gt_f16_e32 vcc, s2, v3
-; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cmp_gt_f16_e64 s[0:1], 0.5, v0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
@@ -1362,42 +1352,40 @@ define amdgpu_kernel void @select_v2f16_imm_c(
 ;
 ; VI-LABEL: select_v2f16_imm_c:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s18, s2
-; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
-; VI-NEXT:    s_mov_b32 s14, s2
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
-; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    s_mov_b32 s8, s10
-; VI-NEXT:    s_mov_b32 s9, s11
-; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s14, s10
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s15, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT:    s_mov_b32 s11, s3
-; VI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v1, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v6, v5
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cmp_nlt_f16_e32 vcc, v5, v4
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3900
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: select_v2f16_imm_c:
@@ -1543,42 +1531,40 @@ define amdgpu_kernel void @select_v2f16_imm_d(
 ;
 ; VI-LABEL: select_v2f16_imm_d:
 ; VI:       ; %bb.0: ; %entry
-; VI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x24
-; VI-NEXT:    s_mov_b32 s3, 0xf000
-; VI-NEXT:    s_mov_b32 s2, -1
-; VI-NEXT:    s_mov_b32 s18, s2
-; VI-NEXT:    s_mov_b32 s19, s3
+; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
+; VI-NEXT:    s_mov_b32 s11, 0xf000
+; VI-NEXT:    s_mov_b32 s10, -1
+; VI-NEXT:    s_mov_b32 s18, s10
+; VI-NEXT:    s_mov_b32 s19, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s16, s8
-; VI-NEXT:    s_mov_b32 s17, s9
-; VI-NEXT:    s_mov_b32 s14, s2
-; VI-NEXT:    s_mov_b32 s12, s6
-; VI-NEXT:    s_mov_b32 s13, s7
-; VI-NEXT:    s_mov_b32 s15, s3
-; VI-NEXT:    s_mov_b32 s8, s10
-; VI-NEXT:    s_mov_b32 s9, s11
-; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s16, s4
+; VI-NEXT:    s_mov_b32 s17, s5
+; VI-NEXT:    s_mov_b32 s14, s10
+; VI-NEXT:    s_mov_b32 s12, s2
+; VI-NEXT:    s_mov_b32 s13, s3
+; VI-NEXT:    s_mov_b32 s15, s11
+; VI-NEXT:    s_mov_b32 s4, s6
+; VI-NEXT:    s_mov_b32 s5, s7
+; VI-NEXT:    s_mov_b32 s6, s10
 ; VI-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT:    s_mov_b32 s11, s3
-; VI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s7, s11
+; VI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3800
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3900
-; VI-NEXT:    s_mov_b32 s0, s4
-; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_mov_b32 s8, s0
+; VI-NEXT:    s_mov_b32 s9, s1
 ; VI-NEXT:    s_waitcnt vmcnt(2)
-; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; VI-NEXT:    s_waitcnt vmcnt(1)
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v1, v0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
-; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v6, v5
-; VI-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_cmp_lt_f16_e32 vcc, v5, v4
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3900
+; VI-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: select_v2f16_imm_d:


        


More information about the llvm-commits mailing list