[llvm-branch-commits] [llvm] [AMDGPU] si-peephole-sdwa: Handle V_PACK_B32_F16_e64 (WIP) (PR #176383)

Frederik Harwath via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Jan 16 05:56:10 PST 2026


https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/176383

>From 907b6c6b6eef7e673fcb844afc16e9aa6f8268a4 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Fri, 16 Jan 2026 04:17:32 -0500
Subject: [PATCH] [AMDGPU] si-peephole-sdwa: Handle V_PACK_B32_F16_e64 (WIP)

Change si-peephole-sdwa to eliminate V_PACK_B32_F16_e64 instructions
by changing the second operand to write to the upper word of the
destination directly.
---
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     |  35 +++
 .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll  | 112 ++++----
 .../GlobalISel/combine-fma-sub-neg-mul.ll     |  56 ++--
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     |  80 +++---
 .../AMDGPU/copysign-simplify-demanded-bits.ll |  11 +-
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     |  38 ++-
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  71 ++---
 .../AMDGPU/fcanonicalize-elimination.ll       |   7 +-
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  50 ++--
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  24 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll         |   7 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll         |   7 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     | 266 +++++++++---------
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  63 ++---
 llvm/test/CodeGen/AMDGPU/fpow.ll              | 196 ++++++-------
 llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll       |  19 +-
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |   9 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |  10 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          | 128 +++++----
 llvm/test/CodeGen/AMDGPU/llvm.exp10.ll        | 169 +++++------
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  42 +--
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  35 +--
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  56 ++--
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          | 236 +++++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        | 236 +++++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         | 195 +++++--------
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   5 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |  15 +-
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |  10 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        | 192 +++++--------
 llvm/test/CodeGen/AMDGPU/repeated-divisor.ll  |  35 ++-
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |  86 +++---
 llvm/test/CodeGen/AMDGPU/sdwa-commute.ll      |   5 +-
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  55 ++--
 llvm/test/CodeGen/AMDGPU/v_pack.ll            |  38 ++-
 35 files changed, 1251 insertions(+), 1348 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index acc4b3f0a68b4..232d975c3fc4e 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -455,6 +455,23 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
         // writing WORD_1. Modifiers don't matter because all the bits that
         // would be impacted are being overwritten by the dst.
         // Any other case will not work.
+        //
+        // FIXME Is this really true for f16 operands? That is, this
+        // change introduced by the v_pack_b32_f16 conversion looks wrong:
+        //@@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half>
+        //%a) {
+        //  ; GFX9-LABEL: v_neg_rsq_v2f16:
+        //  ; GFX9:       ; %bb.0:
+        //  ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+        // -; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD
+        // dst_unused:UNUSED_PAD src0_sel:WORD_1
+        // -; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
+        // -; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+        // +; GFX9-NEXT:    v_rsq_f16_e32 v1, v0
+        // +; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1
+        // dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+        // +; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+        //  ; GFX9-NEXT:    s_setpc_b64 s[30:31]
         SdwaSel DstSel = static_cast<SdwaSel>(
             TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
         if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
@@ -961,7 +978,25 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
 
     return std::make_unique<SDWADstPreserveOperand>(
       OrDst, OrSDWADef, OrOtherDef, DstSel);
+  }
+  case AMDGPU::V_PACK_B32_F16_e64: {
+    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+    bool InvalidOp = false;
+    for (auto *Op : {Dst, Src1, Src2})
+      if (!Op || !Op->isReg() || Op->getReg().isPhysical())
+        InvalidOp = true;
+
+    if (InvalidOp)
+      break;
+
+    if (isSameReg(*Src1, *Src2))
+      break;
 
+    // FIXME Figure out necessary restrictions on Src1 and Src2
+    return std::make_unique<SDWADstPreserveOperand>(Dst, Src1, Src2, WORD_1);
   }
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index d046b854fb0d8..9b4b14e6ca105 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -543,14 +543,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-LABEL: test_v4f16_sub_mul:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -563,27 +561,23 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-DENORM-LABEL: test_v4f16_sub_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX10-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -596,14 +590,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-DENORM-LABEL: test_v4f16_sub_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -642,14 +634,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX9:       ; %bb.0: ; %.entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT:    v_sub_f16_e32 v0, v4, v2
+; GFX9-NEXT:    v_sub_f16_e32 v1, v5, v3
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -662,27 +652,23 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX9-DENORM:       ; %bb.0: ; %.entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v0, v4, v2
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v1, v5, v3
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT:    v_sub_f16_e32 v0, v4, v2
+; GFX10-NEXT:    v_sub_f16_e32 v1, v5, v3
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -695,14 +681,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs:
 ; GFX10-DENORM:       ; %bb.0: ; %.entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v4, v2
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v1, v5, v3
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index c0a828ecacbae..6143e91f037df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -219,14 +219,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -239,27 +237,23 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX9-DENORM:       ; %bb.0: ; %entry
 ; GFX9-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-DENORM-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX10-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX10-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -272,14 +266,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
 ; GFX10-DENORM:       ; %bb.0: ; %entry
 ; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-DENORM-NEXT:    v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v0, v2, v4
+; GFX10-DENORM-NEXT:    v_sub_f16_e32 v1, v3, v5
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT:    v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX10-DENORM-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 62b264a537457..510f1e012440a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1092,20 +1092,20 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2686,16 +2686,16 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2735,16 +2735,16 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX9-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3071,20 +3071,20 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3189,20 +3189,20 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3251,20 +3251,20 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT:    v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..8611af0e58a80 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -31,16 +31,15 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
 ; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
 ; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT:    v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, v2
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_f16 v1, v3, v1
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x7fff7fff, v0
+; GFX9-NEXT:    v_pk_mul_f16 v1, v2, v1
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %y = or <2 x i32> %y.arg, <i32 1, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index d1b8a17915adc..f01407470e3ac 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1787,13 +1787,12 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX9-SDAG-LABEL: fmul_select_v2f16_test3:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, v2, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1817,11 +1816,11 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, v1, v3
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0x3c00, v5, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v1, v2, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1943,13 +1942,12 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX9-SDAG-LABEL: fmul_select_v2f16_test4:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v3
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v1, v2, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1973,11 +1971,11 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, v1, v3
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0x3c00, v5, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v1, v2, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index d370ea86e3dc8..50dff6fc3c391 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -490,19 +490,15 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:  .LBB2_3: ; %exit
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v2
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3800
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x3900
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x3d00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 s[4:5], 0.5, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v5, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f16_e64 s[4:5], 0.5, v3
 ; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, 0.5, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v0, vcc
-; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v5
-; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v5, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ; GFX9-NEXT:  .LBB2_4:
 ; GFX9-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
@@ -1210,19 +1206,15 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX9-NEXT:  .LBB5_3: ; %exit
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3900
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3800
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3900
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3d00
+; GFX9-NEXT:    v_cmp_ge_f16_e64 s[4:5], 0.5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v3, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f16_e64 s[4:5], 0.5, v5
 ; GFX9-NEXT:    v_cmp_nge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v4
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ; GFX9-NEXT:  .LBB5_4:
 ; GFX9-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
@@ -1920,29 +1912,22 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
 ; GFX9-NEXT:  .LBB8_4: ; %exit
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3800
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3900
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3d00
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x3900
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x3d00
 ; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v2, vcc
-; GFX9-NEXT:    v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v2, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
 ; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v2, vcc
-; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v6, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v10, vcc
 ; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v2, vcc
-; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v5, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v2, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0x3800
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v10, vcc
 ; GFX9-NEXT:    v_cmp_ge_f16_e32 vcc, 0.5, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v1, v2, vcc
-; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v10, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v9, v5
-; GFX9-NEXT:    v_pack_b32_f16 v2, v8, v6
-; GFX9-NEXT:    v_pack_b32_f16 v3, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v9, v10, vcc
+; GFX9-NEXT:    v_cmp_le_f16_sdwa vcc, v4, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v1, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v2, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_sdwa v3, v10, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: vec_16xf16_extract_8xf16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index a62673679eb8e..8df2a2f530e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -286,11 +286,10 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16
 ; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
 ; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
 ; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
-; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
-; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
+; GFX9: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GCN-NOT: v_mul
 ; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[V1]]
 define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
@@ -732,7 +731,7 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(fl
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
 ; GFX9-DAG: v_add_f16_e32
-; GFX9-DAG: v_mul_f16_e32
+; GFX9-DAG: v_mul_f16_sdwa
 ; GFX9-NOT: v_max
 ; GFX9-NOT: v_pk_max
 define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index eec6bab67b6c2..9583c8bb8227f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3136,10 +3136,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, <
 ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3642,10 +3641,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2
 ; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3970,10 +3968,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre
 ;
 ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, s0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s0, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    v_bfi_b32 v0, s0, v0, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4428,10 +4425,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg
 ;
 ; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, s1
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_bfi_b32 v0, s1, v1, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
@@ -4762,12 +4758,11 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, <
 ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v4
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v4
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -5463,11 +5458,10 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3
 ; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
 ; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v4
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -5870,15 +5864,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, <
 ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v4
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v3
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v5
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v2, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
@@ -6780,15 +6772,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4
 ; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
 ; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v2
-; GFX9-NEXT:    v_pack_b32_f16 v2, v4, v5
-; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT:    v_bfi_b32 v1, s4, v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
@@ -7371,9 +7361,8 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
 ;
 ; GFX9-LABEL: s_copysign_v2f16_0_v2f32:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -7428,9 +7417,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 ; GFX9-LABEL: v_copysign_v2f16_0_v2bf32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 1334d0ef278d1..5239cd0901071 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2186,17 +2186,17 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-LABEL: v_neg_rsq_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_neg_rsq_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6010f29c166a1..bef30f7d3f5e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -3860,11 +3860,10 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX942-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT:    v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_no_fmaximum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 4506fd649a5ff..b5a6843dad58a 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -3860,11 +3860,10 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v1
 ; GFX942-NEXT:    s_nop 1
 ; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
 ; GFX942-NEXT:    v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX942-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT:    v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-LABEL: v_no_fminimum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index b25b9b994ea09..cc34df7709cde 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -6721,13 +6721,12 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6747,11 +6746,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x3c00, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6825,13 +6824,12 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6851,11 +6849,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x5400, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6929,13 +6927,12 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6956,11 +6953,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xbc00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xd400, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7037,13 +7034,12 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7069,11 +7065,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0x5800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x5400, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7161,13 +7157,12 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7194,11 +7189,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xd400, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7289,13 +7284,12 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcc00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0xcc00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7316,11 +7310,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v3, 0xd800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0xcc00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xcc00, v3, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7397,13 +7391,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7424,11 +7417,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x3c00, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7506,13 +7499,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7533,11 +7525,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x5400, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7615,13 +7607,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xd400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7643,11 +7634,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xbc00, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7728,13 +7719,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xd400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0xbc00
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7756,11 +7746,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbc00
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0xd400, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7841,13 +7831,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7874,11 +7863,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x5400, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7969,13 +7958,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5800
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7996,11 +7984,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x4400, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8078,13 +8066,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4000
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8111,11 +8098,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4000
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x4400, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8206,13 +8193,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x5800
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x4400
 ; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5800
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
+; GFX9-SDAG-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT:    v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8233,11 +8219,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4400
+; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e64 s4, 0, v0
 ; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
-; GFX10-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x5800
+; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x5800, v4, s4
+; GFX10-SDAG-NEXT:    v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX10-SDAG-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index f03becfbb8968..c210379479b3e 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,26 +187,22 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX10-LABEL: fmul_pow2_8xhalf:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
-; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v0
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v2
+; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v3
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v4 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v5 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v6 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v7 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: fmul_pow2_8xhalf:
@@ -295,19 +291,19 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0x7000
+; GFX10-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v0
+; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v1
 ; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v3
+; GFX10-NEXT:    v_ldexp_f16_sdwa v4, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v5, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v6, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_ldexp_f16_sdwa v7, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    v_mov_b32_e32 v2, v6
+; GFX10-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1073,9 +1069,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_pk_mul_f16 v0, 0x4b80, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index ad3f3433c74b3..b360387d009a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -261,55 +261,52 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX9-LABEL: v_pow_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_log_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
-; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    v_exp_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_log_f32_e32 v2, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v2
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_pow_v2f16:
@@ -407,55 +404,52 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_log_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
-; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    v_exp_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_log_f32_e32 v2, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v2
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs:
@@ -554,55 +548,52 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_log_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
-; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    v_exp_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_log_f32_e32 v2, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v2
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs:
@@ -705,55 +696,52 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX9-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX9-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_log_f32_e32 v2, v2
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT:    v_exp_f32_e32 v1, v2
-; GFX9-NEXT:    v_exp_f32_e32 v0, v0
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_exp_f32_e32 v1, v0
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX90A-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX90A-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX90A-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX90A-NEXT:    v_log_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_log_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX90A-NEXT:    v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT:    v_exp_f32_e32 v1, v2
-; GFX90A-NEXT:    v_exp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT:    v_exp_f32_e32 v1, v0
+; GFX90A-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v2, -v0
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_e64 v3, -v1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_log_f32_e32 v2, v2
 ; GFX10-NEXT:    v_log_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
 ; GFX10-NEXT:    v_mul_legacy_f32_e32 v0, v1, v0
 ; GFX10-NEXT:    v_exp_f32_e32 v1, v2
-; GFX10-NEXT:    v_exp_f32_e32 v0, v0
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    v_exp_f32_e32 v2, v0
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT:    v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 77faf363ca412..0c271e76c503a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1853,9 +1853,8 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -1866,10 +1865,9 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-GISEL-NEXT:    s_mov_b32 s2, -1
 ; GFX9-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -3796,11 +3794,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX9-SDAG-NEXT:    s_mov_b32 s4, s0
 ; GFX9-SDAG-NEXT:    s_mov_b32 s5, s1
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
 ; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; GFX9-SDAG-NEXT:    v_cvt_f32_f64_e32 v1, v[2:3]
 ; GFX9-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-SDAG-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -3815,8 +3812,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX9-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -3849,10 +3845,9 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX950-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX950-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
 ; GFX950-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX950-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX950-GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX950-GISEL-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX950-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX950-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX950-GISEL-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index b14935c57152b..97537605de469 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1701,8 +1701,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_fract_f16_e32 v1, v0
-; GFX8-NEXT:    v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT:    v_fract_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX8-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2722,14 +2722,13 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    s_movk_i32 s6, 0x204
-; GFX8-NEXT:    v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_floor_f16_e32 v4, v0
 ; GFX8-NEXT:    v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX8-NEXT:    v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT:    v_floor_f16_e32 v3, v0
 ; GFX8-NEXT:    v_fract_f16_e32 v4, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, 0, s[4:5]
 ; GFX8-NEXT:    v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT:    v_floor_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, 0, s[4:5]
 ; GFX8-NEXT:    v_pack_b32_f16 v0, v0, v5
 ; GFX8-NEXT:    global_store_dword v[1:2], v3, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 769bf0a6458b2..0ebb306d5bae9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -189,9 +189,8 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_cos_f16_e32 v2, v1
-; GFX9-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: cos_v2f16:
@@ -203,9 +202,8 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_cos_f16_e32 v2, v1
-; GFX10-NEXT:    v_cos_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: cos_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 3897a0e028334..e546ce94515c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6065,12 +6065,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v2f16:
@@ -6152,12 +6151,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fabs_v2f16:
@@ -6167,12 +6165,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fabs_v2f16:
@@ -6260,12 +6257,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6275,12 +6271,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6373,12 +6368,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_fneg_v2f16:
@@ -6388,12 +6382,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_fneg_v2f16:
@@ -6476,10 +6469,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT:    v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0]
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6555,23 +6547,39 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_exp_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX900-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX900-NEXT:    v_exp_f32_e32 v2, v2
-; GFX900-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_exp_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v3, v0
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v0
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v3f16:
 ; SI-SDAG:       ; %bb.0:
@@ -6656,14 +6664,14 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x3dc5
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 574b1c0b4974c..3e0000b9a85e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6151,12 +6151,11 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
 ; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_v2f16:
@@ -6238,12 +6237,11 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, |v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_fabs_v2f16:
@@ -6253,12 +6251,11 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fabs_v2f16:
@@ -6346,12 +6343,11 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -|v0|
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6361,12 +6357,11 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6459,12 +6454,11 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e64 v1, -v0
 ; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_fneg_v2f16:
@@ -6474,12 +6468,11 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_fneg_v2f16:
@@ -6576,36 +6569,34 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x113c
-; GFX900-SDAG-NEXT:    s_movk_i32 s5, 0x42a4
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x113c, v0
+; GFX900-SDAG-NEXT:    s_movk_i32 s5, 0x42a4
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x42a4, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v3, v3
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, v2, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v4, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, v2, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x113c
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42a4
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x113c, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42a4
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v4, 0x42a4, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v4, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, v4, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT:    v_exp_f16_e32 v3, v0
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, v4, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_v2f16_fast:
@@ -6694,23 +6685,39 @@ define <3 x half> @v_exp10_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_exp10_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX900-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v2
-; GFX900-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
-; GFX900-NEXT:    v_exp_f32_e32 v2, v2
-; GFX900-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_exp10_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v2
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
+; GFX900-SDAG-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v3, v0
+; GFX900-SDAG-NEXT:    v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-SDAG-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp10_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x40549a78, v2
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x40549a78, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x40549a78, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v0
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_v3f16:
 ; SI-SDAG:       ; %bb.0:
@@ -6817,45 +6824,43 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x113c
 ; GFX900-SDAG-NEXT:    s_movk_i32 s5, 0x42a4
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x113c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x42a4, v1
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x113c, v0
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v4, 0x42a4, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x113c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x42a4, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v3, v3
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v5, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v3, v3
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v4, v4
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v5, v5
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v6, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, v4, v3
 ; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, v1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, v4, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v5
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x113c
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42a4
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x113c, v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42a4
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v5, 0x42a4, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v3, v3
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v5, v5
-; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v4, 0x113c, v1
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x42a4, v1
+; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v4, v4
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, v5, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX900-GISEL-NEXT:    v_exp_f16_e32 v6, v0
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, v5, v3
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, v1, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index dd44a1a35067e..af84a7f8fc303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3188,9 +3188,9 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3266,9 +3266,9 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e64 v1, |v0|
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3350,9 +3350,9 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e64 v1, -|v0|
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3435,9 +3435,9 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e64 v1, -v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3505,9 +3505,9 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3587,10 +3587,10 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp_v3f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3671,10 +3671,10 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
 ; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 37b49d48a10ee..3d48c285cf6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,8 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v3, v0
+; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-SDAG-NEXT:    v_frexp_exp_i16_f16_e32 v0, v0
 ; GFX9-SDAG-NEXT:    v_bfe_i32 v2, v1, 0, 16
@@ -520,11 +519,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v3, v0
 ; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_frexp_exp_i16_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -617,13 +616,13 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_frexp_mant_f16_e32 v1, v0
+; GFX9-NEXT:    v_frexp_mant_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
@@ -683,14 +682,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT:    v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 5fae6de4a9682..bbcd4ceb51f8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -460,11 +460,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
 ; GFX9-SDAG-NEXT:    v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -584,9 +584,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -693,13 +693,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v5, 0x7fff
-; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v5
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v5
+; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v5
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v2, v0, v2
 ; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -843,10 +843,10 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v3, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,16 +973,16 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    s_movk_i32 s4, 0x8000
 ; GFX9-SDAG-NEXT:    v_mov_b32_e32 v6, 0x7fff
-; GFX9-SDAG-NEXT:    v_med3_i32 v5, v5, s4, v6
 ; GFX9-SDAG-NEXT:    v_med3_i32 v4, v4, s4, v6
-; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v6
 ; GFX9-SDAG-NEXT:    v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT:    v_med3_i32 v5, v5, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v4, v1, v4
+; GFX9-SDAG-NEXT:    v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v2, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1155,12 +1155,12 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
 ; GFX9-SDAG:       ; %bb.0:
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v4, v1, v3
+; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v5, v0, v2
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v0, v5
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 6353640bed146..22f7c8f56b52b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6741,15 +6741,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -6873,10 +6883,10 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
@@ -6884,10 +6894,10 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -7021,10 +7031,10 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
@@ -7032,10 +7042,10 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7170,10 +7180,10 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
@@ -7181,10 +7191,10 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7300,15 +7310,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7423,17 +7443,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7554,17 +7586,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7724,30 +7768,28 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v3
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -7914,30 +7956,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x398c, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x398c, v3
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x398c, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 58665c7b24aea..63d8d4a659cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6741,15 +6741,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -6873,10 +6883,10 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
@@ -6884,10 +6894,10 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -7021,10 +7031,10 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7032,10 +7042,10 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7170,10 +7180,10 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
@@ -7181,10 +7191,10 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7300,15 +7310,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7423,17 +7443,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7554,17 +7586,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG:       ; %bb.0:
+; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL:       ; %bb.0:
+; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -7724,30 +7768,28 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v3
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -7914,30 +7956,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX900-SDAG-NEXT:    v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v1, 0x34d1, v2
+; GFX900-SDAG-NEXT:    v_mul_f16_e32 v0, 0x34d1, v3
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT:    v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x34d1, v4
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index cf2c8fe8fc574..20ccd049735ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4121,21 +4121,13 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v2f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v2f16:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -4238,18 +4230,17 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, |v0|
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4361,18 +4352,17 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -|v0|
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_or_b32_e32 v1, 0x80008000, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4485,18 +4475,17 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
 ; GFX900-SDAG:       ; %bb.0:
 ; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT:    v_log_f16_e64 v1, -v0
+; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_xor_b32_e32 v1, 0x80008000, v0
+; GFX900-GISEL-NEXT:    v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4591,21 +4580,13 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v2f16_fast:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v1, v0
+; GFX900-NEXT:    v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -4709,23 +4690,14 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v3f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v3f16:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v3f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -4827,23 +4799,14 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v3f16_fast:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_log_f16_e32 v1, v1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -4963,27 +4926,16 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v4f16:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v4f16:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v4f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-NEXT:    v_log_f16_e32 v3, v1
+; GFX900-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
@@ -5103,27 +5055,16 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
-; GFX900-SDAG:       ; %bb.0:
-; GFX900-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT:    v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX900-SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
-; GFX900-GISEL:       ; %bb.0:
-; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_pack_b32_f16 v1, v3, v1
-; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v4f16_fast:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_log_f16_e32 v2, v0
+; GFX900-NEXT:    v_log_f16_e32 v3, v1
+; GFX900-NEXT:    v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; GFX900-NEXT:    v_mov_b32_e32 v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
 ; GFX1100-SDAG-TRUE16:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 57ce028b1fc4a..513a6f4aba92d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -199,9 +199,8 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX9-NEXT:    s_mov_b32 s5, s1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: rint_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index d1a16b687f930..2bb957813ce86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -977,16 +977,15 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-NEXT:    v_bfi_b32 v2, s5, v2, v3
-; GFX9-NEXT:    v_add_f16_e32 v1, v1, v2
-; GFX9-NEXT:    v_trunc_f16_e32 v2, s6
-; GFX9-NEXT:    v_sub_f16_e32 v3, s6, v2
-; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v3|, 0.5
+; GFX9-NEXT:    v_trunc_f16_e32 v3, s6
+; GFX9-NEXT:    v_sub_f16_e32 v4, s6, v3
+; GFX9-NEXT:    v_cmp_ge_f16_e64 vcc, |v4|, 0.5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v3, s6
-; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v3
-; GFX9-NEXT:    v_add_f16_e32 v0, v2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, s6
+; GFX9-NEXT:    v_bfi_b32 v0, s5, v0, v4
+; GFX9-NEXT:    v_add_f16_e32 v0, v3, v0
 ; GFX9-NEXT:    s_mov_b32 s2, -1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index b7fc76aecf080..b117a922dfba6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -189,9 +189,8 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
 ; GFX9-NEXT:    v_sin_f16_e32 v2, v1
-; GFX9-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: sin_v2f16:
@@ -203,9 +202,8 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
 ; GFX10-NEXT:    v_sin_f16_e32 v2, v1
-; GFX10-NEXT:    v_sin_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT:    v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-TRUE16-LABEL: sin_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 8b31944acc15a..e5e1a1ce53bdd 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1829,25 +1829,25 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-GFX1100-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v1
-; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    v_cvt_f16_f32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v1
-; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    v_cvt_f16_f32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-VI:       ; %bb.0:
@@ -1900,26 +1900,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v3
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1990,29 +1970,29 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX1100-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; SDAG-GFX900:       ; %bb.0:
-; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v2
-; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX900-NEXT:    v_mov_b32_e32 v0, v6
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; SDAG-GFX906:       ; %bb.0:
-; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v2
-; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX906:       ; %bb.0:
+; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX906-NEXT:    v_mov_b32_e32 v0, v6
+; GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-VI:       ; %bb.0:
@@ -2081,30 +2061,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; GISEL-GFX1100-NEXT:    v_pack_b32_f16 v0, v2, v0
 ; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; GISEL-GFX900:       ; %bb.0:
-; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; GISEL-GFX906:       ; %bb.0:
-; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
-;
 ; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2192,31 +2148,31 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX900:       ; %bb.0:
 ; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX900-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG-GFX900-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SDAG-GFX900-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX900-NEXT:    v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v6
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v7
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX906:       ; %bb.0:
 ; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX906-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG-GFX906-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; SDAG-GFX906-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX906-NEXT:    v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v6
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v7
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
@@ -2309,30 +2265,30 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; GISEL-GFX900:       ; %bb.0:
 ; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v3, v6
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-GFX900-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX900-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GISEL-GFX900-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX900-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX900-NEXT:    v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v1, v7
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; GISEL-GFX906:       ; %bb.0:
 ; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v3, v6
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GISEL-GFX906-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX906-NEXT:    v_pack_b32_f16 v0, v3, v0
-; GISEL-GFX906-NEXT:    v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX906-NEXT:    v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX906-NEXT:    v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v1, v7
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 88a8f3affc83a..0276529b2ade7 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -272,8 +272,7 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v2
-; GFX9-NEXT:    v_mul_f16_e32 v1, v1, v2
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mul_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x2_arcp:
@@ -555,9 +554,9 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rcp_f16_e32 v3, v3
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v3
-; GFX9-NEXT:    v_mul_f16_e32 v4, v1, v3
-; GFX9-NEXT:    v_mul_f16_e32 v1, v2, v3
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v4
+; GFX9-NEXT:    v_mul_f16_e32 v2, v2, v3
+; GFX9-NEXT:    v_mul_f16_sdwa v0, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x3_arcp:
@@ -825,11 +824,10 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX9-NEXT:    v_pack_b32_f16 v2, v2, v3
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v2
+; GFX9-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX9-NEXT:    v_rcp_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_repeat_divisor_v2f16_x2:
@@ -926,16 +924,15 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_rcp_f16_e32 v4, v4
-; GFX9-NEXT:    v_rcp_f16_e32 v5, v5
+; GFX9-NEXT:    v_rcp_f16_e32 v6, v4
+; GFX9-NEXT:    v_rcp_f16_sdwa v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rcp_f16_e32 v4, v5
 ; GFX9-NEXT:    s_movk_i32 s4, 0x7e00
-; GFX9-NEXT:    v_pack_b32_f16 v4, v4, v6
-; GFX9-NEXT:    v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v4
-; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v5
-; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v5
-; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v4
+; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v6
+; GFX9-NEXT:    v_pack_b32_f16 v4, v4, s4
+; GFX9-NEXT:    v_pk_mul_f16 v1, v1, v4
+; GFX9-NEXT:    v_pk_mul_f16 v3, v3, v4
+; GFX9-NEXT:    v_pk_mul_f16 v4, v2, v6
 ; GFX9-NEXT:    v_alignbit_b32 v2, v3, v4, 16
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v1, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index a259156c09bd7..5fa6b046b9d4d 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -461,16 +461,16 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -522,17 +522,17 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v1, v0
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v1, v0
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -599,19 +599,17 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, 0x80008000, v0
+; GFX9-NEXT:    v_rndne_f16_e32 v0, v1
+; GFX9-NEXT:    v_rndne_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT:    v_rndne_f16_e32 v1, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80008000, v0
+; GFX10-NEXT:    v_rndne_f16_e32 v0, v1
+; GFX10-NEXT:    v_rndne_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -673,17 +671,17 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_e64 v1, -v0
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_mov_b32_e32 v0, v1
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_e64 v1, -v0
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -756,22 +754,22 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX9-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX9-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT:    v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_roundeven_v4f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_rndne_f16_e32 v2, v0
-; GFX10-NEXT:    v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_rndne_f16_e32 v3, v1
-; GFX10-NEXT:    v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT:    v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -844,23 +842,23 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX9-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX9:       ; %bb.0:
 ; SDAG_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v2, v1
+; SDAG_GFX9-NEXT:    v_rndne_f16_e32 v3, v0
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG_GFX9-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX10-LABEL: v_roundeven_v4f16:
 ; SDAG_GFX10:       ; %bb.0:
 ; SDAG_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT:    v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v2, v0
+; SDAG_GFX10-NEXT:    v_rndne_f16_e32 v3, v1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG_GFX10-NEXT:    v_mov_b32_e32 v1, v3
 ; SDAG_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
index 8ad6a4e534d23..5931a5a1abddb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
@@ -18,9 +18,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs
 ; CHECK-NEXT:    v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; CHECK-NEXT:    v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; CHECK-NEXT:    v_add_f16_e32 v4, v6, v7
-; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
-; CHECK-NEXT:    v_pack_b32_f16 v2, v4, v2
-; CHECK-NEXT:    flat_store_dword v[0:1], v2
+; CHECK-NEXT:    v_add_f16_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; CHECK-NEXT:    flat_store_dword v[0:1], v4
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 5518c7a14cc69..4c5572e2fb58e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -608,13 +608,12 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-LABEL: add_select_fabs_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -684,13 +683,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-LABEL: add_select_posk_posk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x4000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1772,13 +1770,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX9-LABEL: add_select_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1849,13 +1846,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX9-LABEL: add_select_negliteralk_negliteralk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xec00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xe800
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xec00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xe800
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1924,13 +1920,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX9-LABEL: add_select_fneg_negk_negk_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0xc000
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5440eb871f349..c93849fabf2e2 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -36,8 +36,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -144,8 +144,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
-; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -161,8 +161,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
-; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -273,9 +273,8 @@ define amdgpu_kernel void @fptrunc(
 ; GCN-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT:    v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
@@ -286,10 +285,9 @@ define amdgpu_kernel void @fptrunc(
 ; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT:    v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT:    v_cvt_f16_f32_sdwa v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
 ; GISEL-NEXT:    s_mov_b32 s2, -1
 ; GISEL-NEXT:    s_mov_b32 s3, 0x31016000
-; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GISEL-NEXT:    s_endpgm
 ;
@@ -379,8 +377,8 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT:    v_pack_b32_f16 v0, |v0|, |v1|
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -396,8 +394,8 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_pack_b32_f16 v0, |v0|, |v1|
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -512,8 +510,8 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ; GCN-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ; use v0
 ; GCN-NEXT:    ;;#ASMEND
@@ -529,8 +527,8 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ; GISEL-NEXT:    global_load_ushort v2, v0, s[2:3] glc dlc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_pack_b32_f16 v0, -v0, -v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT:    v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND



More information about the llvm-branch-commits mailing list