[llvm-branch-commits] [llvm] [AMDGPU] si-peephole-sdwa: Handle V_PACK_B32_F16_e64 (WIP) (PR #176383)
Frederik Harwath via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jan 16 05:56:10 PST 2026
https://github.com/frederik-h updated https://github.com/llvm/llvm-project/pull/176383
>From 907b6c6b6eef7e673fcb844afc16e9aa6f8268a4 Mon Sep 17 00:00:00 2001
From: Frederik Harwath <fharwath at amd.com>
Date: Fri, 16 Jan 2026 04:17:32 -0500
Subject: [PATCH] [AMDGPU] si-peephole-sdwa: Handle V_PACK_B32_F16_e64 (WIP)
Change si-peephole-sdwa to eliminate V_PACK_B32_F16_e64 instructions
by converting the instruction that defines the second source operand
to SDWA so that it writes the upper word of the destination directly.
---
llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 35 +++
.../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 112 ++++----
.../GlobalISel/combine-fma-sub-neg-mul.ll | 56 ++--
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 80 +++---
.../AMDGPU/copysign-simplify-demanded-bits.ll | 11 +-
.../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 38 ++-
.../CodeGen/AMDGPU/extract-subvector-16bit.ll | 71 ++---
.../AMDGPU/fcanonicalize-elimination.ll | 7 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 50 ++--
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 24 +-
llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 7 +-
llvm/test/CodeGen/AMDGPU/fminimum3.ll | 7 +-
llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll | 266 +++++++++---------
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 63 ++---
llvm/test/CodeGen/AMDGPU/fpow.ll | 196 ++++++-------
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 19 +-
llvm/test/CodeGen/AMDGPU/fract-match.ll | 9 +-
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 10 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 128 +++++----
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 169 +++++------
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 42 +--
llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 35 +--
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 56 ++--
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 236 +++++++++-------
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 236 +++++++++-------
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 195 +++++--------
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 5 +-
llvm/test/CodeGen/AMDGPU/llvm.round.ll | 15 +-
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 10 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 192 +++++--------
llvm/test/CodeGen/AMDGPU/repeated-divisor.ll | 35 ++-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 86 +++---
llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 5 +-
.../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 55 ++--
llvm/test/CodeGen/AMDGPU/v_pack.ll | 38 ++-
35 files changed, 1251 insertions(+), 1348 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index acc4b3f0a68b4..232d975c3fc4e 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -455,6 +455,23 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// writing WORD_1. Modifiers don't matter because all the bits that
// would be impacted are being overwritten by the dst.
// Any other case will not work.
+ //
+ // FIXME: Is this really true for f16 operands? The following change from
+ // the v_pack_b32_f16 conversion looks wrong (the neg modifiers are lost):
+ //@@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half>
+ //%a) {
+ // ; GFX9-LABEL: v_neg_rsq_v2f16:
+ // ; GFX9: ; %bb.0:
+ // ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ // -; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD
+ // dst_unused:UNUSED_PAD src0_sel:WORD_1
+ // -; GFX9-NEXT: v_rsq_f16_e32 v0, v0
+ // -; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1
+ // +; GFX9-NEXT: v_rsq_f16_e32 v1, v0
+ // +; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1
+ // dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+ // +; GFX9-NEXT: v_mov_b32_e32 v0, v1
+ // ; GFX9-NEXT: s_setpc_b64 s[30:31]
SdwaSel DstSel = static_cast<SdwaSel>(
TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
@@ -961,7 +978,25 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
+ }
+ case AMDGPU::V_PACK_B32_F16_e64: {
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+ bool InvalidOp = false;
+ for (auto *Op : {Dst, Src1, Src2})
+ if (!Op || !Op->isReg() || Op->getReg().isPhysical())
+ InvalidOp = true;
+
+ if (InvalidOp)
+ break;
+
+ if (isSameReg(*Src1, *Src2))
+ break;
+ // FIXME: Work out the restrictions needed on Src1/Src2 (e.g. source modifiers).
+ return std::make_unique<SDWADstPreserveOperand>(Dst, Src1, Src2, WORD_1);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index d046b854fb0d8..9b4b14e6ca105 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -543,14 +543,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-LABEL: test_v4f16_sub_mul:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -563,27 +561,23 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-DENORM-LABEL: test_v4f16_sub_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -596,14 +590,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-LABEL: test_v4f16_sub_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -642,14 +634,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-LABEL: test_v4f16_sub_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX9-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -662,27 +652,23 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX10-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -695,14 +681,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index c0a828ecacbae..6143e91f037df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -219,14 +219,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -239,27 +237,23 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -272,14 +266,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX10-DENORM: ; %bb.0: ; %entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%a = fmul <4 x half> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 62b264a537457..510f1e012440a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -1092,20 +1092,20 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn:
@@ -2686,16 +2686,16 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp:
@@ -2735,16 +2735,16 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
@@ -3071,20 +3071,20 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
@@ -3189,20 +3189,20 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
@@ -3251,20 +3251,20 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v1, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v1, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index ef676ddc8070e..8611af0e58a80 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -31,16 +31,15 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3
; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
; GFX9-NEXT: v_or_b32_e32 v1, 1, v1
-; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v3, v1
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0
+; GFX9-NEXT: v_pk_mul_f16 v1, v2, v1
; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = or <2 x i32> %y.arg, <i32 1, i32 1>
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index d1b8a17915adc..f01407470e3ac 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1787,13 +1787,12 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
; GFX9-SDAG-LABEL: fmul_select_v2f16_test3:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x4000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1817,11 +1816,11 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, v1, v3
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0x3c00, v5, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1943,13 +1942,12 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
; GFX9-SDAG-LABEL: fmul_select_v2f16_test4:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x3800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v2, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1973,11 +1971,11 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, v1, v3
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v5, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0x3c00, v5, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v1, v2, v5, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index d370ea86e3dc8..50dff6fc3c391 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -490,19 +490,15 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB2_3: ; %exit
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v2
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x3800
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc
-; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v2, v5 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3900
+; GFX9-NEXT: v_mov_b32_e32 v5, 0x3d00
+; GFX9-NEXT: v_cmp_ge_f16_e64 s[4:5], 0.5, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v5, s[4:5]
+; GFX9-NEXT: v_cmp_ge_f16_e64 s[4:5], 0.5, v3
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v0, vcc
-; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_pack_b32_f16 v1, v0, v5
-; GFX9-NEXT: v_pack_b32_f16 v0, v4, v2
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v5, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v5, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v4, v5, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
@@ -1210,19 +1206,15 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB5_3: ; %exit
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3900
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3d00
-; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3800
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v3 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x3900
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x3d00
+; GFX9-NEXT: v_cmp_ge_f16_e64 s[4:5], 0.5, v4
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v3, s[4:5]
+; GFX9-NEXT: v_cmp_ge_f16_e64 s[4:5], 0.5, v5
; GFX9-NEXT: v_cmp_nge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
-; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX9-NEXT: v_pack_b32_f16 v1, v0, v4
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v3
+; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v2, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-NEXT: .LBB5_4:
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
@@ -1920,29 +1912,22 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: .LBB8_4: ; %exit
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3800
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x3900
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x3d00
+; GFX9-NEXT: v_mov_b32_e32 v9, 0x3900
+; GFX9-NEXT: v_mov_b32_e32 v10, 0x3d00
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
-; GFX9-NEXT: v_cmp_nle_f16_sdwa vcc, v7, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v2, vcc
-; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v6, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v2, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v2, vcc
-; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v5, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v8, 0x3800
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
; GFX9-NEXT: v_cmp_ge_f16_e32 vcc, 0.5, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v2, vcc
-; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v0 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v10, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v9, v5
-; GFX9-NEXT: v_pack_b32_f16 v2, v8, v6
-; GFX9-NEXT: v_pack_b32_f16 v3, v3, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
+; GFX9-NEXT: v_cmp_le_f16_sdwa vcc, v4, v8 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v1, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v2, v9, v10, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_cndmask_b32_sdwa v3, v10, v9, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: vec_16xf16_extract_8xf16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index a62673679eb8e..8df2a2f530e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -286,11 +286,10 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
-; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
-; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]]
+; GFX9: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GCN-NOT: v_mul
; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]]
+; GCN: {{flat|global}}_store_dword v{{.+}}, [[V1]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id
@@ -732,7 +731,7 @@ define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(fl
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
; GFX9-DAG: v_add_f16_e32
-; GFX9-DAG: v_mul_f16_e32
+; GFX9-DAG: v_mul_f16_sdwa
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index eec6bab67b6c2..9583c8bb8227f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3136,10 +3136,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, <
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -3642,10 +3641,9 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -3970,10 +3968,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s0, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4428,10 +4425,9 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, s2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
@@ -4762,12 +4758,11 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, <
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v4
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -5463,11 +5458,10 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -5870,15 +5864,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, <
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
+; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
@@ -6780,15 +6772,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX9-NEXT: v_pack_b32_f16 v2, v4, v5
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
@@ -7371,9 +7361,8 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
;
; GFX9-LABEL: s_copysign_v2f16_0_v2f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s0
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -7428,9 +7417,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
; GFX9-LABEL: v_copysign_v2f16_0_v2bf32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 1334d0ef278d1..5239cd0901071 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2186,17 +2186,17 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_rsq_f16_e32 v1, v0
+; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_rsq_f16_e32 v1, v0
+; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_rsq_v2f16:
@@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-LABEL: v_neg_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rsq_f16_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX9-NEXT: v_rsq_f16_e32 v1, v0
+; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_rsq_f16_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX10-NEXT: v_rsq_f16_e32 v1, v0
+; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 6010f29c166a1..bef30f7d3f5e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -3860,11 +3860,10 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_no_fmaximum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 4506fd649a5ff..b5a6843dad58a 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -3860,11 +3860,10 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX942-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX942-NEXT: v_min_f16_e32 v1, v0, v2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_sdwa v0, v4, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX942-NEXT: s_setpc_b64 s[30:31]
;
; GFX950-LABEL: v_no_fminimum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index b25b9b994ea09..cc34df7709cde 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -6721,13 +6721,12 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_64_1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6747,11 +6746,11 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x3c00, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6825,13 +6824,12 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_1_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6851,11 +6849,11 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x5400, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6929,13 +6927,12 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_n1_n64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -6956,11 +6953,11 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xbc00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xd400, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7037,13 +7034,12 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_128_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7069,11 +7065,11 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x5800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x5400, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7161,13 +7157,12 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xd800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7194,11 +7189,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xd400, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7289,13 +7284,12 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
; GFX9-SDAG-LABEL: v_mul_v2f16_select_n128_n16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xcc00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xd800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0xcc00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7316,11 +7310,11 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xd800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xcc00, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xcc00, v3, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0xcc00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xcc00, v3, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_mul_f16 v0, v2, v0
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7397,13 +7391,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_64_1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x5400
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7424,11 +7417,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x3c00, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7506,13 +7499,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_1_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3c00
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7533,11 +7525,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x5400, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7615,13 +7607,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n64_n1:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0xd400
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0xbc00
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xd400
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7643,11 +7634,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0xd400
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0xbc00
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xbc00, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7728,13 +7719,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_n1_n64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xd400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0xbc00
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7756,11 +7746,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0xbc00
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xd400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0xd400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0xd400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0xd400, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7841,13 +7831,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_64:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x5800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7874,11 +7863,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x5400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x5400, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7969,13 +7958,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_128_4:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x5800
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -7996,11 +7984,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x4400, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -8078,13 +8066,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_2_4:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -8111,11 +8098,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x4400, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x4400
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x4400, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -8206,13 +8193,12 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
; GFX9-SDAG-LABEL: v_contract_mul_add_v2f16_select_4_128:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x5800
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x4400
; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x5800
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
+; GFX9-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[4:5]
+; GFX9-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -8233,11 +8219,11 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e64 s4, 0, v0
; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x5800, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 0x5800, v4, vcc_lo
-; GFX10-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x5800
+; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x5800, v4, s4
+; GFX10-SDAG-NEXT: v_cndmask_b32_sdwa v0, v1, v4, vcc_lo dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX10-SDAG-NEXT: v_pk_fma_f16 v0, v2, v0, v3
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index f03becfbb8968..c210379479b3e 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -187,26 +187,22 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; GFX10-LABEL: fmul_pow2_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v3
-; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v2
-; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v1
-; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v7, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v5, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v4, v3
-; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
-; GFX10-NEXT: v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
+; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_cvt_f16_u16_e32 v4, v0
+; GFX10-NEXT: v_cvt_f16_u16_e32 v5, v1
+; GFX10-NEXT: v_cvt_f16_u16_e32 v6, v2
+; GFX10-NEXT: v_cvt_f16_u16_e32 v7, v3
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_pk_mul_f16 v0, 0x7000, v4 op_sel_hi:[0,1]
+; GFX10-NEXT: v_pk_mul_f16 v1, 0x7000, v5 op_sel_hi:[0,1]
+; GFX10-NEXT: v_pk_mul_f16 v2, 0x7000, v6 op_sel_hi:[0,1]
+; GFX10-NEXT: v_pk_mul_f16 v3, 0x7000, v7 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: fmul_pow2_8xhalf:
@@ -295,19 +291,19 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, 0x7000
-; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v3
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x7000
+; GFX10-NEXT: v_ldexp_f16_e32 v4, 0x7000, v0
+; GFX10-NEXT: v_ldexp_f16_e32 v5, 0x7000, v1
; GFX10-NEXT: v_ldexp_f16_e32 v6, 0x7000, v2
-; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v1
-; GFX10-NEXT: v_ldexp_f16_e32 v8, 0x7000, v0
-; GFX10-NEXT: v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v8, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v7, v1
-; GFX10-NEXT: v_pack_b32_f16 v2, v6, v2
-; GFX10-NEXT: v_pack_b32_f16 v3, v5, v3
+; GFX10-NEXT: v_ldexp_f16_e32 v7, 0x7000, v3
+; GFX10-NEXT: v_ldexp_f16_sdwa v4, v8, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v5, v8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v6, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_ldexp_f16_sdwa v7, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v2, v6
+; GFX10-NEXT: v_mov_b32_e32 v3, v7
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1073,9 +1069,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0]
; GFX10-NEXT: v_cvt_f16_u16_e32 v1, v0
-; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
+; GFX10-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v1 op_sel_hi:[0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index ad3f3433c74b3..b360387d009a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -261,55 +261,52 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX9-LABEL: v_pow_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_pow_v2f16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_log_f32_e32 v2, v2
; GFX90A-NEXT: v_log_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT: v_exp_f32_e32 v2, v2
; GFX90A-NEXT: v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT: v_exp_f32_e32 v1, v2
-; GFX90A-NEXT: v_exp_f32_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT: v_exp_f32_e32 v1, v0
+; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_pow_v2f16:
@@ -407,55 +404,52 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_pow_v2f16_fneg_lhs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX90A-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_log_f32_e32 v2, v2
; GFX90A-NEXT: v_log_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT: v_exp_f32_e32 v2, v2
; GFX90A-NEXT: v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT: v_exp_f32_e32 v1, v2
-; GFX90A-NEXT: v_exp_f32_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT: v_exp_f32_e32 v1, v0
+; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_lhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs:
@@ -554,55 +548,52 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_pow_v2f16_fneg_rhs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX90A-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_log_f32_e32 v2, v2
; GFX90A-NEXT: v_log_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT: v_exp_f32_e32 v2, v2
; GFX90A-NEXT: v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT: v_exp_f32_e32 v1, v2
-; GFX90A-NEXT: v_exp_f32_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT: v_exp_f32_e32 v1, v0
+; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs:
@@ -705,55 +696,52 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_exp_f32_e32 v2, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
-; GFX9-NEXT: v_exp_f32_e32 v1, v2
-; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_exp_f32_e32 v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX9-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX90A-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX90A-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX90A-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX90A-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX90A-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX90A-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_log_f32_e32 v2, v2
; GFX90A-NEXT: v_log_f32_e32 v0, v0
; GFX90A-NEXT: v_mul_legacy_f32 v2, v3, v2
+; GFX90A-NEXT: v_exp_f32_e32 v2, v2
; GFX90A-NEXT: v_mul_legacy_f32 v0, v1, v0
-; GFX90A-NEXT: v_exp_f32_e32 v1, v2
-; GFX90A-NEXT: v_exp_f32_e32 v0, v0
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX90A-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX90A-NEXT: v_exp_f32_e32 v1, v0
+; GFX90A-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX90A-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e64 v3, -v1
+; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_log_f32_e32 v2, v2
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX10-NEXT: v_exp_f32_e32 v1, v2
-; GFX10-NEXT: v_exp_f32_e32 v0, v0
-; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_exp_f32_e32 v2, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX10-NEXT: v_cvt_f16_f32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 77faf363ca412..0c271e76c503a 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1853,9 +1853,8 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -1866,10 +1865,9 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -3796,11 +3794,10 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
-; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-SDAG-NEXT: s_endpgm
;
@@ -3815,8 +3812,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
@@ -3849,10 +3845,9 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX950-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index b14935c57152b..97537605de469 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1701,8 +1701,8 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_fract_f16_e32 v1, v0
-; GFX8-NEXT: v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT: v_fract_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX8-NEXT: v_mov_b32_e32 v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: basic_fract_v2f16_nonan:
@@ -2722,14 +2722,13 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s6, 0x204
-; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_floor_f16_e32 v4, v0
; GFX8-NEXT: v_fract_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cmp_class_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_pack_b32_f16 v3, v4, v3
+; GFX8-NEXT: v_floor_f16_e32 v3, v0
; GFX8-NEXT: v_fract_f16_e32 v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5]
; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6
+; GFX8-NEXT: v_floor_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, 0, s[4:5]
; GFX8-NEXT: v_pack_b32_f16 v0, v0, v5
; GFX8-NEXT: global_store_dword v[1:2], v3, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 769bf0a6458b2..0ebb306d5bae9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -189,9 +189,8 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
; GFX9-NEXT: v_cos_f16_e32 v2, v1
-; GFX9-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_v2f16:
@@ -203,9 +202,8 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
; GFX10-NEXT: v_cos_f16_e32 v2, v1
-; GFX10-NEXT: v_cos_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: cos_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 3897a0e028334..e546ce94515c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6065,12 +6065,11 @@ define <2 x half> @v_exp_v2f16(<2 x half> %in) {
; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp_v2f16:
@@ -6152,12 +6151,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_fabs_v2f16:
@@ -6167,12 +6165,11 @@ define <2 x half> @v_exp_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp_fabs_v2f16:
@@ -6260,12 +6257,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6275,12 +6271,11 @@ define <2 x half> @v_exp_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp_fneg_fabs_v2f16:
@@ -6373,12 +6368,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_fneg_v2f16:
@@ -6388,12 +6382,11 @@ define <2 x half> @v_exp_fneg_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp_fneg_v2f16:
@@ -6476,10 +6469,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
-; GFX900-SDAG-NEXT: v_pk_mul_f16 v0, v0, s4 op_sel_hi:[1,0]
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_pk_mul_f16 v1, v0, s4 op_sel_hi:[1,0]
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
@@ -6555,23 +6547,39 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_exp_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-; GFX900-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX900-NEXT: v_exp_f32_e32 v2, v2
-; GFX900-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
-; GFX900-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_exp_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v0
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp_v3f16:
; SI-SDAG: ; %bb.0:
@@ -6656,14 +6664,14 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp_v3f16_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x3dc5
; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x3dc5, v1
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index 574b1c0b4974c..3e0000b9a85e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6151,12 +6151,11 @@ define <2 x half> @v_exp10_v2f16(<2 x half> %in) {
; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_v2f16:
@@ -6238,12 +6237,11 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_fabs_v2f16:
@@ -6253,12 +6251,11 @@ define <2 x half> @v_exp10_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_fabs_v2f16:
@@ -6346,12 +6343,11 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6361,12 +6357,11 @@ define <2 x half> @v_exp10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_fneg_fabs_v2f16:
@@ -6459,12 +6454,11 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_fneg_v2f16:
@@ -6474,12 +6468,11 @@ define <2 x half> @v_exp10_fneg_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549a78, v0
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_fneg_v2f16:
@@ -6576,36 +6569,34 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x113c
-; GFX900-SDAG-NEXT: s_movk_i32 s5, 0x42a4
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x113c, v0
+; GFX900-SDAG-NEXT: s_movk_i32 s5, 0x42a4
; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x42a4, v0
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v3, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: v_exp_f16_e32 v3, v3
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, v2, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v4, v0
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, v2, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v2f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x113c
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42a4
; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x113c, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42a4
; GFX900-GISEL-NEXT: v_mul_f16_e32 v4, 0x42a4, v0
-; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: v_exp_f16_e32 v2, v2
; GFX900-GISEL-NEXT: v_exp_f16_e32 v4, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, v4, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-GISEL-NEXT: v_exp_f16_e32 v3, v0
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, v4, v2
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_v2f16_fast:
@@ -6694,23 +6685,39 @@ define <3 x half> @v_exp10_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_exp10_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2
-; GFX900-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
-; GFX900-NEXT: v_exp_f32_e32 v2, v2
-; GFX900-NEXT: v_exp_f32_e32 v0, v0
-; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
-; GFX900-NEXT: v_exp_f32_e32 v1, v1
-; GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_exp10_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v3, v0
+; GFX900-SDAG-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_exp10_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549a78, v2
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549a78, v1
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v3, v0
+; GFX900-GISEL-NEXT: v_exp_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_sdwa v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_v3f16:
; SI-SDAG: ; %bb.0:
@@ -6817,45 +6824,43 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x113c
; GFX900-SDAG-NEXT: s_movk_i32 s5, 0x42a4
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x113c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x42a4, v1
; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x113c, v0
; GFX900-SDAG-NEXT: v_mul_f16_e32 v4, 0x42a4, v0
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x113c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x42a4, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v3, v3
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v4, v4
; GFX900-SDAG-NEXT: v_mul_f16_sdwa v5, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v2
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v3, v3
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v4, v4
; GFX900-SDAG-NEXT: v_exp_f16_e32 v5, v5
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v6, v0
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, v4, v3
; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, v4, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, v0, v5
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp10_v3f16_afn:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x113c
-; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42a4
; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x113c, v0
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x42a4
; GFX900-GISEL-NEXT: v_mul_f16_e32 v5, 0x42a4, v0
-; GFX900-GISEL-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: v_exp_f16_e32 v3, v3
; GFX900-GISEL-NEXT: v_exp_f16_e32 v5, v5
-; GFX900-GISEL-NEXT: v_exp_f16_e32 v2, v2
-; GFX900-GISEL-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX900-GISEL-NEXT: v_mul_f16_e32 v4, 0x113c, v1
; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x42a4, v1
+; GFX900-GISEL-NEXT: v_exp_f16_e32 v2, v2
; GFX900-GISEL-NEXT: v_exp_f16_e32 v4, v4
; GFX900-GISEL-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, v5, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, v0, v2
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v3, v0
+; GFX900-GISEL-NEXT: v_exp_f16_e32 v6, v0
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, v5, v3
; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, v1, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; SI-SDAG-LABEL: v_exp10_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index dd44a1a35067e..af84a7f8fc303 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -3188,9 +3188,9 @@ define <2 x half> @v_exp2_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16:
@@ -3266,9 +3266,9 @@ define <2 x half> @v_exp2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e64 v1, |v0|
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fabs_v2f16:
@@ -3350,9 +3350,9 @@ define <2 x half> @v_exp2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e64 v1, -|v0|
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_fabs_v2f16:
@@ -3435,9 +3435,9 @@ define <2 x half> @v_exp2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e64 v1, -v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_fneg_v2f16:
@@ -3505,9 +3505,9 @@ define <2 x half> @v_exp2_v2f16_fast(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v2f16_fast:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v2f16_fast:
@@ -3587,10 +3587,10 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp_v3f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp_v3f16:
@@ -3671,10 +3671,10 @@ define <3 x half> @v_exp2_v3f16_afn(<3 x half> %in) {
; GFX900-SDAG-LABEL: v_exp2_v3f16_afn:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_exp_f16_e32 v0, v0
+; GFX900-SDAG-NEXT: v_exp_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_exp_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_exp_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_exp2_v3f16_afn:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 37b49d48a10ee..3d48c285cf6ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -411,9 +411,8 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v2, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v3, v2, v1
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v3, v0
+; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-SDAG-NEXT: v_frexp_exp_i16_f16_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v2, v1, 0, 16
@@ -520,11 +519,11 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v3, v0
; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_frexp_exp_i16_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX9-GISEL-NEXT: v_bfe_i32 v2, v0, 0, 16
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v3, v4
+; GFX9-GISEL-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, v3
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
@@ -617,13 +616,13 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-SDAG-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-SDAG-NEXT: v_frexp_mant_f16_e32 v0, v0
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_frexp_mant_f16_e32 v1, v0
+; GFX9-NEXT: v_frexp_mant_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX11-SDAG-TRUE16: ; %bb.0:
@@ -683,14 +682,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
-; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_e32 v1, v0
-; GFX9-GISEL-NEXT: v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 5fae6de4a9682..bbcd4ceb51f8a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -460,11 +460,11 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x7fff
-; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3
; GFX9-SDAG-NEXT: v_med3_i32 v1, v1, s4, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v0, v1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -584,9 +584,9 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v2f16_v2i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v1
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v2, v0, v1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -693,13 +693,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x7fff
-; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v5
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v5
+; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v5
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v2, v0, v2
; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v5
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -843,10 +843,10 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v3f16_v3i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v3, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v3
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -973,16 +973,16 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x8000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x7fff
-; GFX9-SDAG-NEXT: v_med3_i32 v5, v5, s4, v6
; GFX9-SDAG-NEXT: v_med3_i32 v4, v4, s4, v6
-; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v6
; GFX9-SDAG-NEXT: v_med3_i32 v2, v2, s4, v6
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v4
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v5
+; GFX9-SDAG-NEXT: v_med3_i32 v5, v5, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v4, v1, v4
+; GFX9-SDAG-NEXT: v_med3_i32 v3, v3, s4, v6
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v2, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v2, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1155,12 +1155,12 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
; GFX9-SDAG-LABEL: test_ldexp_v4f16_v4i16:
; GFX9-SDAG: ; %bb.0:
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v1, v3
-; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX9-SDAG-NEXT: v_pack_b32_f16 v1, v1, v4
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v4, v1, v3
+; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v5, v0, v2
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v5, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_ldexp_f16_sdwa v4, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, v4
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 6353640bed146..22f7c8f56b52b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6741,15 +6741,25 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -6873,10 +6883,10 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fabs_v2f16:
@@ -6884,10 +6894,10 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fabs_v2f16:
@@ -7021,10 +7031,10 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_fabs_v2f16:
@@ -7032,10 +7042,10 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7170,10 +7180,10 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_fneg_v2f16:
@@ -7181,10 +7191,10 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_fneg_v2f16:
@@ -7300,15 +7310,25 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7423,17 +7443,29 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7554,17 +7586,29 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7724,30 +7768,28 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v3
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16:
@@ -7914,30 +7956,28 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x398c
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x398c, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x398c, v3
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x398c, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x398c, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x398c
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x398c, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x398c, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 58665c7b24aea..63d8d4a659cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6741,15 +6741,25 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -6873,10 +6883,10 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fabs_v2f16:
@@ -6884,10 +6894,10 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fabs_v2f16:
@@ -7021,10 +7031,10 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7032,10 +7042,10 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7170,10 +7180,10 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_fneg_v2f16:
@@ -7181,10 +7191,10 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_fneg_v2f16:
@@ -7300,15 +7310,25 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v2f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v1, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v2f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v2f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7423,17 +7443,29 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7554,17 +7586,29 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_log10_v3f16_fast:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_log_f16_e32 v2, v0
-; GFX900-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-NEXT: v_log_f16_e32 v1, v1
-; GFX900-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-SDAG-LABEL: v_log10_v3f16_fast:
+; GFX900-SDAG: ; %bb.0:
+; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v0
+; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v3, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-GISEL-LABEL: v_log10_v3f16_fast:
+; GFX900-GISEL: ; %bb.0:
+; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -7724,30 +7768,28 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v3
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16:
@@ -7914,30 +7956,28 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-SDAG-NEXT: v_log_f16_e32 v2, v1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX900-SDAG-NEXT: v_log_f16_e32 v3, v0
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v3, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v2, v1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-SDAG-NEXT: s_movk_i32 s4, 0x34d1
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v1, 0x34d1, v2
+; GFX900-SDAG-NEXT: v_mul_f16_e32 v0, 0x34d1, v3
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v0, v4, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-SDAG-NEXT: v_mul_f16_sdwa v1, v5, s4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log10_v4f16_fast:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v2, 0x34d1, v2
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v0
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v3, 0x34d1, v3
-; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX900-GISEL-NEXT: v_log_f16_e32 v4, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x34d1
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v0, 0x34d1, v2
+; GFX900-GISEL-NEXT: v_mul_f16_e32 v1, 0x34d1, v4
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v0, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX900-GISEL-NEXT: v_mul_f16_sdwa v1, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index cf2c8fe8fc574..20ccd049735ef 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4121,21 +4121,13 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v2f16:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v2f16:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v2f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v1, v0
+; GFX900-NEXT: v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -4238,18 +4230,17 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, |v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_e64 v1, |v0|
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fabs_v2f16:
@@ -4361,18 +4352,17 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -|v0|
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -|v0|
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_fabs_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_or_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_or_b32_e32 v1, 0x80008000, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4485,18 +4475,17 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
; GFX900-SDAG-LABEL: v_log2_fneg_v2f16:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e64 v0, -v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX900-SDAG-NEXT: v_log_f16_e64 v1, -v0
+; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, v1
; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-GISEL-LABEL: v_log2_fneg_v2f16:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX900-GISEL-NEXT: v_xor_b32_e32 v1, 0x80008000, v0
+; GFX900-GISEL-NEXT: v_log_f16_e32 v0, v1
+; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_fneg_v2f16:
@@ -4591,21 +4580,13 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v1, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v2f16_fast:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v2f16_fast:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v2f16_fast:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v1, v0
+; GFX900-NEXT: v_log_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v2f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -4709,23 +4690,14 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v3f16:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v3f16:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v3f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v2, v0
+; GFX900-NEXT: v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_log_f16_e32 v1, v1
+; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -4827,23 +4799,14 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v0, v2, v0
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v3f16_fast:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v2
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v3f16_fast:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v1, v1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v3f16_fast:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v2, v0
+; GFX900-NEXT: v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_log_f16_e32 v1, v1
+; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v3f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -4963,27 +4926,16 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v1, v3, v1
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v4f16:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v4f16:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v4f16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v2, v0
+; GFX900-NEXT: v_log_f16_e32 v3, v1
+; GFX900-NEXT: v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16:
; GFX1100-SDAG-TRUE16: ; %bb.0:
@@ -5103,27 +5055,16 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
; VI-GISEL-NEXT: v_or_b32_e32 v1, v3, v1
; VI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-SDAG-LABEL: v_log2_v4f16_fast:
-; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-SDAG-NEXT: v_log_f16_e32 v0, v0
-; GFX900-SDAG-NEXT: v_log_f16_e32 v1, v1
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v0, v0, v3
-; GFX900-SDAG-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX900-GISEL-LABEL: v_log2_v4f16_fast:
-; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_log_f16_e32 v2, v0
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_log_f16_e32 v3, v1
-; GFX900-GISEL-NEXT: v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX900-GISEL-NEXT: v_pack_b32_f16 v1, v3, v1
-; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_log2_v4f16_fast:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_log_f16_e32 v2, v0
+; GFX900-NEXT: v_log_f16_e32 v3, v1
+; GFX900-NEXT: v_log_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_log_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX900-NEXT: v_mov_b32_e32 v0, v2
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-SDAG-TRUE16-LABEL: v_log2_v4f16_fast:
; GFX1100-SDAG-TRUE16: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 57ce028b1fc4a..513a6f4aba92d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -199,9 +199,8 @@ define amdgpu_kernel void @rint_v2f16(
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: rint_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index d1a16b687f930..2bb957813ce86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -977,16 +977,15 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_bfi_b32 v2, s5, v2, v3
-; GFX9-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX9-NEXT: v_trunc_f16_e32 v2, s6
-; GFX9-NEXT: v_sub_f16_e32 v3, s6, v2
-; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
+; GFX9-NEXT: v_trunc_f16_e32 v3, s6
+; GFX9-NEXT: v_sub_f16_e32 v4, s6, v3
+; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v4|, 0.5
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v3
-; GFX9-NEXT: v_add_f16_e32 v0, v2, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v4
+; GFX9-NEXT: v_add_f16_e32 v0, v3, v0
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index b7fc76aecf080..b117a922dfba6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -189,9 +189,8 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
; GFX9-NEXT: v_sin_f16_e32 v2, v1
-; GFX9-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_v2f16:
@@ -203,9 +202,8 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_mul_f16 v1, v1, 0.15915494 op_sel_hi:[1,0]
; GFX10-NEXT: v_sin_f16_e32 v2, v1
-; GFX10-NEXT: v_sin_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: sin_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 8b31944acc15a..e5e1a1ce53bdd 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1829,25 +1829,25 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; SDAG-GFX900: ; %bb.0:
-; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v1
-; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_cvt_f16_f32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; SDAG-GFX906: ; %bb.0:
-; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v0, v0, v1
-; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_cvt_f16_f32_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
; SDAG-VI: ; %bb.0:
@@ -1900,26 +1900,6 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v1, v0
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; GISEL-GFX900: ; %bb.0:
-; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v3
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v1, v0
-; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v2f32_clamp_precvt:
-; GISEL-GFX906: ; %bb.0:
-; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v3
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v1, v0
-; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1990,29 +1970,29 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v2
; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; SDAG-GFX900: ; %bb.0:
-; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v2
-; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX900-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; SDAG-GFX906: ; %bb.0:
-; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v0, v0, v2
-; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX906-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX906-NEXT: v_mov_b32_e32 v0, v6
+; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
; SDAG-VI: ; %bb.0:
@@ -2081,30 +2061,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v2, v0
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; GISEL-GFX900: ; %bb.0:
-; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v2, v0
-; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_precvt:
-; GISEL-GFX906: ; %bb.0:
-; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v2, v0
-; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2192,31 +2148,31 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SDAG-GFX900-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX900-NEXT: v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v7
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v6
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v6, v6
+; SDAG-GFX906-NEXT: v_cvt_f16_f32_e32 v7, v7
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX906-NEXT: v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v7
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
@@ -2309,30 +2265,30 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX900-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v7, v7
; GISEL-GFX900-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v3, v6
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-GFX900-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX900-NEXT: v_pack_b32_f16 v0, v3, v0
-; GISEL-GFX900-NEXT: v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX900-NEXT: v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v7
; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-GFX906: ; %bb.0:
; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX906-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v7, v7
; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v3, v6
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-GFX906-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-GFX906-NEXT: v_pack_b32_f16 v0, v3, v0
-; GISEL-GFX906-NEXT: v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_sdwa v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX906-NEXT: v_cvt_f16_f32_sdwa v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v7
; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_precvt:
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 88a8f3affc83a..0276529b2ade7 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -272,8 +272,7 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v2, v2
; GFX9-NEXT: v_mul_f16_e32 v0, v0, v2
-; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mul_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x2_arcp:
@@ -555,9 +554,9 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v3, v3
; GFX9-NEXT: v_mul_f16_e32 v0, v0, v3
-; GFX9-NEXT: v_mul_f16_e32 v4, v1, v3
-; GFX9-NEXT: v_mul_f16_e32 v1, v2, v3
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v4
+; GFX9-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX9-NEXT: v_mul_f16_sdwa v0, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_repeat_divisor_f16_x3_arcp:
@@ -825,11 +824,10 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
; GFX9-LABEL: v_repeat_divisor_v2f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v2, v2
-; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v2
+; GFX9-NEXT: v_rcp_f16_e32 v3, v2
+; GFX9-NEXT: v_rcp_f16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_repeat_divisor_v2f16_x2:
@@ -926,16 +924,15 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
; GFX9-LABEL: v_repeat_divisor_v3f16_x2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_rcp_f16_e32 v4, v4
-; GFX9-NEXT: v_rcp_f16_e32 v5, v5
+; GFX9-NEXT: v_rcp_f16_e32 v6, v4
+; GFX9-NEXT: v_rcp_f16_sdwa v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rcp_f16_e32 v4, v5
; GFX9-NEXT: s_movk_i32 s4, 0x7e00
-; GFX9-NEXT: v_pack_b32_f16 v4, v4, v6
-; GFX9-NEXT: v_pack_b32_f16 v5, v5, s4
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v4
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v5
-; GFX9-NEXT: v_pk_mul_f16 v3, v3, v5
-; GFX9-NEXT: v_pk_mul_f16 v4, v2, v4
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, v6
+; GFX9-NEXT: v_pack_b32_f16 v4, v4, s4
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, v4
+; GFX9-NEXT: v_pk_mul_f16 v3, v3, v4
+; GFX9-NEXT: v_pk_mul_f16 v4, v2, v6
; GFX9-NEXT: v_alignbit_b32 v2, v3, v4, 16
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index a259156c09bd7..5fa6b046b9d4d 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -461,16 +461,16 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -522,17 +522,17 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_e32 v1, v0
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_mov_b32_e32 v0, v1
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_e32 v1, v0
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_mov_b32_e32 v0, v1
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16:
@@ -599,19 +599,17 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; GFX9-LABEL: v_roundeven_v2f16_fneg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX9-NEXT: v_rndne_f16_e32 v1, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v0
+; GFX9-NEXT: v_rndne_f16_e32 v0, v1
+; GFX9-NEXT: v_rndne_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v2f16_fneg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10-NEXT: v_rndne_f16_e32 v1, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v0
+; GFX10-NEXT: v_rndne_f16_e32 v0, v1
+; GFX10-NEXT: v_rndne_f16_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -673,17 +671,17 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_e64 v1, -v0
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_mov_b32_e32 v0, v1
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v2f16_fneg:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e64 v0, -v0
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_e64 v1, -v0
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v1, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_mov_b32_e32 v0, v1
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v2f16_fneg:
@@ -756,22 +754,22 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v2, v0
-; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_rndne_f16_e32 v3, v1
-; GFX9-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_roundeven_v4f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
-; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_rndne_f16_e32 v3, v1
-; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_roundeven_v4f16:
@@ -844,23 +842,23 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; SDAG_GFX9-LABEL: v_roundeven_v4f16:
; SDAG_GFX9: ; %bb.0:
; SDAG_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX9-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX9-NEXT: v_rndne_f16_e32 v2, v1
+; SDAG_GFX9-NEXT: v_rndne_f16_e32 v3, v0
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX9-NEXT: v_mov_b32_e32 v0, v3
+; SDAG_GFX9-NEXT: v_mov_b32_e32 v1, v2
; SDAG_GFX9-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX10-LABEL: v_roundeven_v4f16:
; SDAG_GFX10: ; %bb.0:
; SDAG_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v0, v0
-; SDAG_GFX10-NEXT: v_rndne_f16_e32 v1, v1
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v0, v0, v3
-; SDAG_GFX10-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG_GFX10-NEXT: v_rndne_f16_e32 v2, v0
+; SDAG_GFX10-NEXT: v_rndne_f16_e32 v3, v1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_rndne_f16_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+; SDAG_GFX10-NEXT: v_mov_b32_e32 v0, v2
+; SDAG_GFX10-NEXT: v_mov_b32_e32 v1, v3
; SDAG_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX11-TRUE16-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
index 8ad6a4e534d23..5931a5a1abddb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
@@ -18,9 +18,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs
; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_add_f16_e32 v4, v6, v7
-; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
-; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2
-; CHECK-NEXT: flat_store_dword v[0:1], v2
+; CHECK-NEXT: v_add_f16_sdwa v4, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+; CHECK-NEXT: flat_store_dword v[0:1], v4
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 5518c7a14cc69..4c5572e2fb58e 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -608,13 +608,12 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX9-LABEL: add_select_fabs_negk_negk_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc000
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -684,13 +683,12 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
; GFX9-LABEL: add_select_posk_posk_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x3c00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
+; GFX9-NEXT: v_mov_b32_e32 v3, 0x4000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1772,13 +1770,12 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
; GFX9-LABEL: add_select_negk_negk_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc000
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1849,13 +1846,12 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
; GFX9-LABEL: add_select_negliteralk_negliteralk_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xec00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xe800
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xec00
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xe800
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1924,13 +1920,12 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX9-LABEL: add_select_fneg_negk_negk_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v3, 0xbc00
-; GFX9-NEXT: v_mov_b32_e32 v4, 0xc000
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00
+; GFX9-NEXT: v_mov_b32_e32 v3, 0xc000
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[4:5]
+; GFX9-NEXT: v_cndmask_b32_sdwa v0, v1, v3, vcc dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 5440eb871f349..c93849fabf2e2 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -19,8 +19,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v0
; GCN-NEXT: ;;#ASMEND
@@ -36,8 +36,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
@@ -144,8 +144,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
-; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v0
; GCN-NEXT: ;;#ASMEND
@@ -161,8 +161,8 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
@@ -273,9 +273,8 @@ define amdgpu_kernel void @fptrunc(
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_sdwa v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
@@ -286,10 +285,9 @@ define amdgpu_kernel void @fptrunc(
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
+; GISEL-NEXT: v_cvt_f16_f32_sdwa v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD
; GISEL-NEXT: s_mov_b32 s2, -1
; GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GISEL-NEXT: s_endpgm
;
@@ -379,8 +377,8 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v0
; GCN-NEXT: ;;#ASMEND
@@ -396,8 +394,8 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_pack_b32_f16 v0, |v0|, |v1|
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
@@ -512,8 +510,8 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GCN-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GCN-NEXT: v_mov_b32_e32 v1, 0x4000
+; GCN-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; use v0
; GCN-NEXT: ;;#ASMEND
@@ -529,8 +527,8 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
-; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x4000
+; GISEL-NEXT: v_add_f16_sdwa v0, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
; GISEL-NEXT: ;;#ASMSTART
; GISEL-NEXT: ; use v0
; GISEL-NEXT: ;;#ASMEND
More information about the llvm-branch-commits
mailing list