[llvm] [AMDGPU][True16] remove pack32 pattern for true16 mode (PR #171756)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 10 19:04:15 PST 2025


https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/171756

Remove pack32 so that isel use reg_sequence in true16 mode for build_vector. This generates better code

>From e64d2f542c7961f3697f8c05f7bed69d998c53ba Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 10 Dec 2025 22:02:26 -0500
Subject: [PATCH] remove pack_t16 pattern for true16 mode

---
 llvm/lib/Target/AMDGPU/SIInstructions.td      |   3 -
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     |  14 +--
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  68 +++++------
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll |  20 ++--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll     |  84 ++++++-------
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |   5 +-
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll         |   3 +-
 llvm/test/CodeGen/AMDGPU/fminimum3.ll         |   3 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     |  70 +++++------
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  35 ++----
 llvm/test/CodeGen/AMDGPU/fpow.ll              |  24 ++--
 llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll       |  30 ++---
 llvm/test/CodeGen/AMDGPU/fract-match.ll       |  26 ++--
 llvm/test/CodeGen/AMDGPU/half.ll              | 113 +++++++-----------
 llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll     |   3 +-
 llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll      |   5 +-
 llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll    |   3 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  72 +++++------
 llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll        |  25 +---
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  50 ++------
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  50 ++------
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  42 -------
 llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll     |   9 +-
 llvm/test/CodeGen/AMDGPU/llvm.round.ll        |   2 -
 llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll      |   5 +-
 llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll     |   8 +-
 llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll    |   9 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll        |   5 +-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        |  64 +++++-----
 llvm/test/CodeGen/AMDGPU/repeated-divisor.ll  |  19 +--
 llvm/test/CodeGen/AMDGPU/roundeven.ll         |   8 --
 .../AMDGPU/select-fabs-fneg-extract.v2f16.ll  |  35 +++---
 .../AMDGPU/sext-in-reg-vector-shuffle.ll      |  36 +++---
 llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll    |  29 ++---
 llvm/test/CodeGen/AMDGPU/sitofp.f16.ll        |  13 +-
 .../true16-imm-folded-to-0-regression.ll      |   7 +-
 llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll    |  29 ++---
 llvm/test/CodeGen/AMDGPU/uitofp.f16.ll        |  13 +-
 llvm/test/CodeGen/AMDGPU/v_pack.ll            |  40 +++----
 .../test/CodeGen/AMDGPU/vector-reduce-fmax.ll |  10 +-
 .../test/CodeGen/AMDGPU/vector-reduce-fmin.ll |  10 +-
 41 files changed, 383 insertions(+), 716 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 984d1a4db4cd6..4cc66b3a5adee 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3860,9 +3860,6 @@ let SubtargetPredicate = isGFX9Plus in {
 let True16Predicate = NotHasTrue16BitInsts in
   def : PackB32Pat<V_PACK_B32_F16_e64>;
 
-let True16Predicate = UseRealTrue16Insts in
-  def : PackB32Pat<V_PACK_B32_F16_t16_e64>;
-
 let True16Predicate = UseFakeTrue16Insts in
   def : PackB32Pat<V_PACK_B32_F16_fake16_e64>;
 } // End SubtargetPredicate = isGFX9Plus
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 0c7dc74d95e49..d1b8a17915adc 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1848,10 +1848,9 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x4000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x4000, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x4000, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2005,10 +2004,9 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, v1, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x3800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x3800, s0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.h, v2.l, 0x3800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v1.l, v2.l, 0x3800, s0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 555adecf9c79c..d370ea86e3dc8 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -524,19 +524,16 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB2_3: ; %exit
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
 ; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, 0x3d00, s2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB2_4:
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
@@ -1251,19 +1248,16 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB5_3: ; %exit
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.h
 ; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v1.l, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v2.l, v1.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, 0x3d00, v1.l, s1
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v1.l, 0x3d00, s2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ; GFX11-TRUE16-NEXT:  .LBB5_4:
 ; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
@@ -1965,42 +1959,36 @@ define amdgpu_gfx <8 x half> @vec_16xf16_extract_8xf16_0(i1 inreg %cond, ptr add
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %F
 ; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[2:3], off glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB8_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB8_4
 ; GFX11-TRUE16-NEXT:  .LBB8_2:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11
 ; GFX11-TRUE16-NEXT:  .LBB8_3: ; %T
 ; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v[0:1], off glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:  .LBB8_4: ; %exit
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v7.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0x3900
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v6.l
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v7.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v4.h
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s3, 0.5, v5.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.h, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v3.h, 0x3d00, s0
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3900
 ; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v4.l
-; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e64 s1, 0.5, v5.h
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s2, 0.5, v2.h
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s3, 0.5, v3.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, 0x3d00, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v0.l, 0x3d00, s0
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3.l
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s0, 0.5, v2.l
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s34, 0.5, v4.h
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v0.l, 0x3d00, s2
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, 0x3d00, s3
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v0.l, 0x3d00, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v0.l, 0x3d00, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v0.l, 0x3d00, s34
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, 0x3d00, v0.l, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v1.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v5.l, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v4.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e64 s34, 0.5, v6.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, 0x3d00, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.h, v3.h, 0x3d00, s3
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v1.l, v3.h, 0x3d00, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.h, 0x3d00, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.h, v3.h, 0x3d00, s34
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, 0x3d00, v3.h, s1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: vec_16xf16_extract_8xf16_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index da08f4fcf8f3d..2335da7f0abde 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -2568,8 +2568,7 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16:
@@ -2770,8 +2769,7 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 2.0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x4000
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16:
@@ -2813,9 +2811,8 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
 ; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, 2.0, v0.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x4000
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16:
@@ -2907,10 +2904,9 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
 ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0x7e007e00
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -3015,10 +3011,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
 ; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 9a17538ea9b1b..a773bf256bd0a 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3146,11 +3146,10 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, <
 ; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v1, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
@@ -3653,11 +3652,10 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2
 ; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.h, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
@@ -3983,12 +3981,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inre
 ;
 ; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
 ; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, s1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, s2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -4443,12 +4439,10 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg
 ;
 ; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
 ; GFX11-TRUE16:       ; %bb.0:
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, s2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, s0, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -4780,14 +4774,12 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, <
 ; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v2
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v1, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v2, v4
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
@@ -5483,14 +5475,12 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3
 ; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v4
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
@@ -5894,16 +5884,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, <
 ; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v1, v4
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v2, v5
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v3, v5
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
@@ -6807,16 +6794,13 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4
 ; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v5
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.h, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.h, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.h, v5
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, v0, v3
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff7fff, v1, v5
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 9e527cf38e7ee..1334d0ef278d1 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -2204,8 +2204,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_rsq_v2f16:
@@ -2415,7 +2413,8 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, -v0.l, -v0.h
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_neg_rsq_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 91f95859ed3db..6010f29c166a1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -3835,9 +3835,8 @@ define <2 x half> @v_no_fmaximum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_maximum_f16 v0.l, v0.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_maximum_f16 v0.h, v0.l, v2.l
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_no_fmaximum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 55fc7747bf81d..4506fd649a5ff 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -3835,9 +3835,8 @@ define <2 x half> @v_no_fminimum3_f16__multi_use(half %a, half %b, half %c) {
 ; GFX12-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_minimum_f16 v0.l, v0.l, v1.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_minimum_f16 v0.h, v0.l, v2.l
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: v_no_fminimum3_f16__multi_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index 8b43e707da055..eb7698d3348ed 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -6757,9 +6757,8 @@ define <2 x half> @v_mul_v2f16_select_64_1(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5400, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5400, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6862,9 +6861,8 @@ define <2 x half> @v_mul_v2f16_select_1_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x3c00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x3c00, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x3c00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x3c00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6969,9 +6967,8 @@ define <2 x half> @v_mul_v2f16_select_n1_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xbc00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xbc00, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xbc00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xbc00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7087,9 +7084,8 @@ define <2 x half> @v_mul_v2f16_select_128_64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5800, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7214,9 +7210,8 @@ define <2 x half> @v_mul_v2f16_select_n128_n64(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7332,9 +7327,8 @@ define <2 x half> @v_mul_v2f16_select_n128_n16(<2 x i32> %arg, <2 x half> %x) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0xcc00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xd800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xd800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_mul_f16 v0, v2, v0
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7441,9 +7435,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_64_1(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x3c00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5400, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5400, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7551,9 +7544,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_1_64(<2 x i32> %arg, <2 x hal
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x3c00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x3c00, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x3c00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x3c00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7663,9 +7655,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n64_n1(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xbc00
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xd400, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xd400, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xd400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xd400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7777,9 +7768,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_n1_n64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0xd400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xbc00, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xbc00, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0xbc00, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0xbc00, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -7900,9 +7890,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_64(<2 x i32> %arg, <2 x h
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8018,9 +8007,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_128_4(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x5800, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x5800, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8139,9 +8127,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_2_4(<2 x i32> %arg, <2 x half
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x4400
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4000, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4000, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4000, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4000, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -8257,9 +8244,8 @@ define <2 x half> @v_contract_mul_add_v2f16_select_4_128(<2 x i32> %arg, <2 x ha
 ; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0x5800
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-SDAG-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4400, vcc_lo
-; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4400, s0
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.h, v4.l, 0x4400, vcc_lo
+; GFX11-SDAG-TRUE16-NEXT:    v_cndmask_b16 v0.l, v4.l, 0x4400, s0
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_fma_f16 v0, v2, v0, v3
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index bd9807477c1c0..f03becfbb8968 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -213,27 +213,21 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v4, v2, 1 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v2, v1, 1 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v4.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v3.l, v3.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v2.l, v2.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v2.l, v5.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.h, v1.h
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v2.h, v2.h
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v3.l, v4.h
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v3.h, v3.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v1.h, v2.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v3.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v0.l, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, 0x7000, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
@@ -327,12 +321,6 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, 0x7000, v1.h
 ; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v2.h, 0x7000, v2.h
 ; GFX11-TRUE16-NEXT:    v_ldexp_f16_e32 v3.h, 0x7000, v3.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v2.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v3.l, v3.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: fmul_pow2_ldexp_8xhalf:
@@ -1097,8 +1085,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index 11e74facb7a71..ad3f3433c74b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -329,10 +329,8 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_pow_v2f16:
@@ -477,10 +475,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs:
@@ -626,10 +622,8 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_rhs:
@@ -779,10 +773,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v1, v2
 ; GFX11-TRUE16-NEXT:    v_exp_f32_e32 v2, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 657e35a90a9a5..77faf363ca412 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1915,14 +1915,12 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[1:2], off, s[8:11], 0
+; GFX11-SDAG-TRUE16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
 ; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-SDAG-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
 ; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f32_to_v2f16:
@@ -1956,8 +1954,6 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s3
 ; GFX11-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
@@ -3875,13 +3871,11 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX11-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f32_f64_e32 v2, v[2:3]
-; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f32_f64_e32 v1, v[0:1]
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-SDAG-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
 ; GFX11-SDAG-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
@@ -3921,8 +3915,6 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
 ;
@@ -4000,12 +3992,8 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_pk_f16_f32 v0, v0, v1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index f00657da440f0..b14935c57152b 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1710,8 +1710,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: basic_fract_v2f16_nonan:
@@ -1733,8 +1731,6 @@ define <2 x half> @basic_fract_v2f16_nonan(<2 x half> nofpclass(nan) %x) {
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-FAKE16-LABEL: basic_fract_v2f16_nonan:
@@ -2747,14 +2743,11 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX11-TRUE16-NEXT:    v_fract_f16_e32 v3.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.h, 0x204
 ; GFX11-TRUE16-NEXT:    v_cmp_class_f16_e64 s1, v0.l, 0x204
-; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v4.h, v0.h
+; GFX11-TRUE16-NEXT:    v_floor_f16_e32 v4.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, 0, s0
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, 0, s1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, 0, s0
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0, s1
 ; GFX11-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2788,14 +2781,11 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon
 ; GFX12-TRUE16-NEXT:    v_fract_f16_e32 v3.h, v0.h
 ; GFX12-TRUE16-NEXT:    v_cmp_class_f16_e64 s0, v0.h, 0x204
 ; GFX12-TRUE16-NEXT:    v_cmp_class_f16_e64 s1, v0.l, 0x204
-; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v4.h, v0.h
+; GFX12-TRUE16-NEXT:    v_floor_f16_e32 v4.l, v0.l
 ; GFX12-TRUE16-NEXT:    s_wait_alu depctr_va_sdst(0)
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.h, v3.h, 0, s0
-; GFX12-TRUE16-NEXT:    v_cndmask_b16 v3.l, v3.l, 0, s1
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v4, v0.l, v0.h
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v3.h
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.h, 0, s0
+; GFX12-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0, s1
 ; GFX12-TRUE16-NEXT:    global_store_b32 v[1:2], v4, off
 ; GFX12-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 8e427a6ef2023..5154ba95aec78 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -2595,15 +2595,13 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-LABEL: global_truncstore_v2f32_to_v2f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v3, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b32 v3, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b32 v2, v1, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: global_truncstore_v2f32_to_v2f16:
@@ -2684,14 +2682,12 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b96 v[0:2], v3, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.l, v0
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_store_d16_hi_b16 v3, v0, s[0:1] offset:4
-; GFX11-TRUE16-NEXT:    global_store_b32 v3, v1, s[0:1]
+; GFX11-TRUE16-NEXT:    global_store_b16 v3, v1, s[0:1] offset:4
+; GFX11-TRUE16-NEXT:    global_store_b32 v3, v4, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: global_truncstore_v3f32_to_v3f16:
@@ -2770,14 +2766,11 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    global_store_b64 v4, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v0
+; GFX11-TRUE16-NEXT:    global_store_b64 v4, v[2:3], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: global_truncstore_v4f32_to_v4f16:
@@ -2884,20 +2877,15 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
 ; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v7
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v6
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v5
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.l, v4
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v0.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v4.l, v5.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v7
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v4
 ; GFX11-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -3070,44 +3058,35 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; GFX11-TRUE16-LABEL: global_truncstore_v16f32_to_v16f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v17, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x3
-; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v17, s[2:3] offset:16
-; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v17, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_b128 v[8:11], v17, s[2:3] offset:48
-; GFX11-TRUE16-NEXT:    global_load_b128 v[12:15], v17, s[2:3] offset:32
+; GFX11-TRUE16-NEXT:    global_load_b128 v[0:3], v16, s[2:3] offset:16
+; GFX11-TRUE16-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_b128 v[8:11], v16, s[2:3] offset:48
+; GFX11-TRUE16-NEXT:    global_load_b128 v[12:15], v16, s[2:3] offset:32
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v3
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v16.l, v5
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v16.h, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.h, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v3.l, v2
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.l, v11
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v4.h, v10
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v9
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.h, v8
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v11.h, v11
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v11.l, v10
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v10.h, v9
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v10.l, v8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.l, v15
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.h, v14
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.l, v13
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.h, v12
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v7
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v6
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v7, v4.h, v4.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v6, v5.h, v5.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v5, v8.h, v8.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v9.h, v9.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v16.h, v16.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.h, v15
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v9.l, v14
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.h, v13
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v8.l, v12
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v7
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v6
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v4
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_store_b128 v17, v[4:7], s[0:1] offset:16
-; GFX11-TRUE16-NEXT:    global_store_b128 v17, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX11-TRUE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: global_truncstore_v16f32_to_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 893dc3995e72f..117a169069782 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -160,9 +160,8 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_ceil_f16_e32 v0.h, v1.l
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index e5fcc2609333b..b7768998160d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -226,8 +226,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cos_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -262,9 +260,8 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
 ; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 22bb79dd9f4f3..868e69dfbd352 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -161,9 +161,8 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_floor_f16_e32 v0.h, v1.l
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index eed67d9e020d7..37b49d48a10ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -424,15 +424,13 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -457,15 +455,13 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -534,15 +530,13 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
-; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.l, v1.h
-; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -567,15 +561,13 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.l
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.h
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
-; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.l, v1.h
-; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
+; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
+; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
+; GFX12-GISEL-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
 ; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -638,8 +630,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -661,8 +651,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -708,8 +696,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
 ; GFX11-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
@@ -731,8 +717,6 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
 ; GFX12-GISEL-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX12-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-GISEL-FAKE16-LABEL: test_frexp_v2f16_v2i32_only_use_fract:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 22f562ab8557b..930fab95b863d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -475,9 +475,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -595,8 +594,6 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v1.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -709,16 +706,14 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
 ; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v4, v4, s0, 0x7fff
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v4, s0, 0x7fff
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v4.l
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -856,8 +851,6 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v2.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -997,11 +990,8 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v1.h, v5.l
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1172,9 +1162,6 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v0.h, v2.h
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index dceeba51025ed..aa4178c6ed56d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6883,8 +6883,6 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v2f16:
@@ -6908,8 +6906,6 @@ define <2 x half> @v_log_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v2f16:
@@ -7026,8 +7022,6 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_fabs_v2f16:
@@ -7053,8 +7047,6 @@ define <2 x half> @v_log_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_fabs_v2f16:
@@ -7178,8 +7170,6 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7205,8 +7195,6 @@ define <2 x half> @v_log_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_fabs_v2f16:
@@ -7331,8 +7319,6 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_fneg_v2f16:
@@ -7358,8 +7344,6 @@ define <2 x half> @v_log_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_fneg_v2f16:
@@ -7458,8 +7442,6 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v2f16_fast:
@@ -7483,8 +7465,6 @@ define <2 x half> @v_log_v2f16_fast(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v2f16_fast:
@@ -7585,12 +7565,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v3f16:
@@ -7615,12 +7594,11 @@ define <3 x half> @v_log_v3f16(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v3f16:
@@ -7718,12 +7696,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v3f16_fast:
@@ -7748,12 +7725,11 @@ define <3 x half> @v_log_v3f16_fast(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v3f16_fast:
@@ -7906,14 +7882,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v4f16:
@@ -7945,13 +7918,11 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v4f16:
@@ -8101,14 +8072,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log_v4f16_fast:
@@ -8140,13 +8108,11 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x398c, v0.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x398c, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x398c, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x398c, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index f40316d1ba824..1ead1bbda8a05 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6883,8 +6883,6 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v2f16:
@@ -6908,8 +6906,6 @@ define <2 x half> @v_log10_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v2f16:
@@ -7026,8 +7022,6 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_fabs_v2f16:
@@ -7053,8 +7047,6 @@ define <2 x half> @v_log10_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_fabs_v2f16:
@@ -7178,8 +7170,6 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7205,8 +7195,6 @@ define <2 x half> @v_log10_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_fabs_v2f16:
@@ -7331,8 +7319,6 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_fneg_v2f16:
@@ -7358,8 +7344,6 @@ define <2 x half> @v_log10_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_fneg_v2f16:
@@ -7458,8 +7442,6 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v2f16_fast:
@@ -7483,8 +7465,6 @@ define <2 x half> @v_log10_v2f16_fast(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v2f16_fast:
@@ -7585,12 +7565,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v3f16:
@@ -7615,12 +7594,11 @@ define <3 x half> @v_log10_v3f16(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v3f16:
@@ -7718,12 +7696,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v3f16_fast:
@@ -7748,12 +7725,11 @@ define <3 x half> @v_log10_v3f16_fast(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v3f16_fast:
@@ -7906,14 +7882,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v4f16:
@@ -7945,13 +7918,11 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v4f16:
@@ -8101,14 +8072,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log10_v4f16_fast:
@@ -8140,13 +8108,11 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x34d1, v0.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0x34d1, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.l, 0x34d1, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_mul_f16_e32 v1.h, 0x34d1, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index e24fd1f22bfa6..46b8ab09d917c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -4157,8 +4157,6 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v2f16:
@@ -4177,8 +4175,6 @@ define <2 x half> @v_log2_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v2f16:
@@ -4276,8 +4272,6 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, |v0.h|
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, |v0.l|
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_fabs_v2f16:
@@ -4298,8 +4292,6 @@ define <2 x half> @v_log2_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_fabs_v2f16:
@@ -4403,8 +4395,6 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -|v0.h|
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -|v0.l|
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4425,8 +4415,6 @@ define <2 x half> @v_log2_fneg_fabs_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_fabs_v2f16:
@@ -4531,8 +4519,6 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.h, -v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e64 v0.l, -v0.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_fneg_v2f16:
@@ -4553,8 +4539,6 @@ define <2 x half> @v_log2_fneg_v2f16(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_fneg_v2f16:
@@ -4643,8 +4627,6 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v2f16_fast:
@@ -4663,8 +4645,6 @@ define <2 x half> @v_log2_v2f16_fast(<2 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v2f16_fast:
@@ -4768,8 +4748,6 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v3f16:
@@ -4790,8 +4768,6 @@ define <3 x half> @v_log2_v3f16(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v3f16:
@@ -4890,8 +4866,6 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v3f16_fast:
@@ -4912,8 +4886,6 @@ define <3 x half> @v_log2_v3f16_fast(<3 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v3f16_fast:
@@ -5035,9 +5007,6 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v4f16:
@@ -5062,10 +5031,6 @@ define <4 x half> @v_log2_v4f16(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v4f16:
@@ -5182,9 +5147,6 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v0.l, v0.l
 ; GFX1100-SDAG-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
-; GFX1100-SDAG-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-FAKE16-LABEL: v_log2_v4f16_fast:
@@ -5209,10 +5171,6 @@ define <4 x half> @v_log2_v4f16_fast(<4 x half> %in) {
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v0.h, v0.h
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.l, v1.l
 ; GFX1100-GISEL-TRUE16-NEXT:    v_log_f16_e32 v1.h, v1.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_3)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX1100-GISEL-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX1100-GISEL-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX1100-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-GISEL-FAKE16-LABEL: v_log2_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 3d8a8a2962921..57ce028b1fc4a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -220,9 +220,8 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -265,8 +264,10 @@ define amdgpu_kernel void @rint_v2f16(
 ; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v1.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index b241b9b800d2a..d1a16b687f930 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1017,8 +1017,6 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.h, v0.h, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 48cd7c0f3286b..56cfec29e4ed2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -226,8 +226,6 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX11-TRUE16-NEXT:    v_sin_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_sin_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -262,9 +260,8 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
 ; GFX12-TRUE16-NEXT:    v_sin_f16_e32 v0.l, v0.l
-; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-TRUE16-NEXT:    v_sin_f16_e32 v0.h, v0.h
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 84a27bb84c442..6fbfcf3451430 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -200,8 +200,6 @@ define amdgpu_kernel void @sqrt_v2f16(
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -245,8 +243,10 @@ define amdgpu_kernel void @sqrt_v2f16(
 ; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v1.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 482a7de073847..047c170ab527f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -199,9 +199,8 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -244,8 +243,10 @@ define amdgpu_kernel void @trunc_v2f16(
 ; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
 ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v1.l
-; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 259ee0b26d2d8..2b6c487e27ed2 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -65,9 +65,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
 ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, 1.0, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3c00
 ; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index ba77552e5809b..8b31944acc15a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -409,10 +409,10 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v3.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
@@ -530,11 +530,11 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v6.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v6
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32:
@@ -698,13 +698,12 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v6.h
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v6.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v6
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32:
@@ -1047,13 +1046,12 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1479,10 +1477,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v3.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
@@ -1643,9 +1641,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v3.l, v0.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
@@ -1815,10 +1813,8 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v3
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_precvt:
@@ -1974,11 +1970,10 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v6
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v6
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
@@ -2169,14 +2164,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v3, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v6
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v7
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v6
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v7
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 624f4480e689a..88a8f3affc83a 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -283,8 +283,6 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x2_arcp:
@@ -570,8 +568,6 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v1.l, v3.l
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_repeat_divisor_f16_x3_arcp:
@@ -842,8 +838,6 @@ define <4 x half> @v_repeat_divisor_v2f16_x2(<2 x half> %x, <2 x half> %y, <2 x
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v2.h, v2.h
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v2.l, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v2.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, v1, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
@@ -950,20 +944,17 @@ define <6 x half> @v_repeat_divisor_v3f16_x2(<3 x half> %x, <3 x half> %y, <3 x
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v4.h, v4.h
-; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v5.l, v5.l
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v4.l, v4.l
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v5.l, v5.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.h, 0x7e00
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr depctr_va_vdst(0)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v5, v5.l, 0x7e00
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v4, v4.l, v4.h
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v2, v2, v4
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v3, v3, v5
+; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v1, v1, v5
 ; GFX11-TRUE16-NEXT:    v_pk_mul_f16 v0, v0, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v2.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index cf3edc0b4ac96..a259156c09bd7 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -478,7 +478,6 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_roundeven_v2f16:
@@ -541,7 +540,6 @@ define <2 x half> @v_roundeven_v2f16(<2 x half> %x) {
 ; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
-; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v2f16:
@@ -622,7 +620,6 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80008000, v0
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_roundeven_v2f16_fneg:
@@ -694,7 +691,6 @@ define <2 x half> @v_roundeven_v2f16_fneg(<2 x half> %x) {
 ; SDAG_GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e64 v0.h, -v0.h
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e64 v0.l, -v0.l
-; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v2f16_fneg:
@@ -785,8 +781,6 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.l, v1.l
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.h, v1.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_roundeven_v4f16:
@@ -876,8 +870,6 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.h, v0.h
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
 ; SDAG_GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v1.l, v1.l
-; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
-; SDAG_GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, v1.h
 ; SDAG_GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG_GFX11-FAKE16-LABEL: v_roundeven_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 9814ed80befbf..5518c7a14cc69 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -625,10 +625,9 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -702,10 +701,9 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4000, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x4000, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0x4000, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1791,10 +1789,9 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) {
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1869,10 +1866,9 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xe800, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xe800, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xe800, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xe800, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, v0, v2
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1945,10 +1941,9 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_u32_e64 s0, 0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, vcc_lo
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, s0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, v3.l, 0xc000, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v3.l, 0xc000, s0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
index 584d26ed41893..6f48e4d6e2dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll
@@ -8,35 +8,31 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1)
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
-; GFX11-TRUE16-NEXT:    v_bfe_i32 v7, v2, 0, 8
-; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v5, 24, v2
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.h
 ; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v6, 24, v1
-; GFX11-TRUE16-NEXT:    v_bfe_i32 v8, v3, 0, 8
 ; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.l, 8, v1.l
 ; GFX11-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v10, v4, 0, 8
+; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v9, 24, v2
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
 ; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.h, 8, v2.l
-; GFX11-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v7.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v8.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_bfe_i32 v7, v3, 0, 8
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.l, v6.l
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v10.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v3.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v9.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v3.h, v4.l
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.h, v5.l
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.h, v6.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.h, v5.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.l, v1.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.h, v2.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.l, v3.l
-; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.l, v4.l
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.h, v1.h
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.h, v2.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v4.h, v4.l
-; GFX11-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v7.l
+; GFX11-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index d46278633f341..8c823849616b9 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -810,6 +810,7 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s7, s2, s3
 ; GFX11-TRUE16-NEXT:    s_xor_b32 s9, s0, s1
@@ -837,12 +838,9 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s0
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: s_sint_to_fp_v2i64_to_v2f16:
@@ -1102,21 +1100,16 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v3, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v2, v2, v9
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v0, v10
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, v10
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v1, v11
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v8
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 3, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
-; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v3
+; GFX11-TRUE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_sint_to_fp_v4i64_to_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 7ddd90e7aeb53..792bd03ad15aa 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -237,9 +237,8 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -334,13 +333,11 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v1, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_i32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll
index 0bebb5849ed81..27840c4f582d9 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll
+++ b/llvm/test/CodeGen/AMDGPU/true16-imm-folded-to-0-regression.ll
@@ -4,16 +4,15 @@
 ; Make sure that the 16-bit constant 0x3c00 isn't folded as 0 into
 ; v_bfi_b32.
 define i32 @mov16_bfi_fold_regression(half %arg, i32 %arg1) {
-; CHECK-LABEL: bfi_fold_regression:
+; CHECK-LABEL: mov16_bfi_fold_regression:
 ; CHECK:       ; %bb.0: ; %bb
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b16_e32 v2.l, 0x3c00
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; CHECK-NEXT:    v_bfi_b32 v0, 0x7fff, v2, v0
+; CHECK-NEXT:    v_mov_b16_e32 v0.h, 0
 ; CHECK-NEXT:    v_cndmask_b16 v0.l, 0x3c00, v0.l, vcc_lo
-; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT:    v_pack_b32_f16 v0, v0.l, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %cmp = icmp eq i32 %arg1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index d25178f11063c..bf57f65a570cf 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -659,6 +659,7 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clz_i32_u32 s6, s3
 ; GFX11-TRUE16-NEXT:    s_clz_i32_u32 s7, s1
@@ -678,12 +679,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, s0
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, s1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
+; GFX11-TRUE16-NEXT:    global_store_b32 v2, v0, s[4:5]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: s_uint_to_fp_v2i64_to_v2f16:
@@ -886,21 +884,16 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v3, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v2, v2, v9
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v0, v10
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v0, v0, v10
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v1, v1, v11
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v5, v1, v11
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v4, 3, v8
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v3, 3, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
-; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v3
+; GFX11-TRUE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_uint_to_fp_v4i64_to_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 2d7ce10644928..8fe5cbeaab34d 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -237,9 +237,8 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.h, v1.l
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -334,13 +333,11 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v1, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v1
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16:
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index d8044139aceb3..5440eb871f349 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -96,8 +96,6 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
@@ -116,8 +114,6 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
 ; GFX11-GISEL-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GISEL-REAL16-NEXT:    ; use v0
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMEND
@@ -225,8 +221,6 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GCN-REAL16-NEXT:    v_subrev_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
@@ -245,8 +239,6 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
 ; GFX11-GISEL-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-REAL16-NEXT:    v_subrev_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GISEL-REAL16-NEXT:    ; use v0
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMEND
@@ -348,14 +340,12 @@ define amdgpu_kernel void @fptrunc(
 ; GFX11-GCN-REAL16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-GCN-REAL16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-GCN-REAL16-NEXT:    s_mov_b32 s4, s0
-; GFX11-GCN-REAL16-NEXT:    buffer_load_b64 v[1:2], off, s[8:11], 0
+; GFX11-GCN-REAL16-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
 ; GFX11-GCN-REAL16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GCN-REAL16-NEXT:    v_cvt_f16_f32_e32 v0.l, v2
-; GFX11-GCN-REAL16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
-; GFX11-GCN-REAL16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-GCN-REAL16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-GCN-REAL16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-GCN-REAL16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
 ; GFX11-GCN-REAL16-NEXT:    s_endpgm
 ;
 ; GFX11-GISEL-REAL16-LABEL: fptrunc:
@@ -368,8 +358,6 @@ define amdgpu_kernel void @fptrunc(
 ; GFX11-GISEL-REAL16-NEXT:    v_cvt_f16_f32_e32 v0.h, s3
 ; GFX11-GISEL-REAL16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-REAL16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-REAL16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; GFX11-GISEL-REAL16-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX11-GISEL-REAL16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
@@ -468,8 +456,9 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, |v0.l|, |v0.h|
+; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GCN-REAL16-NEXT:    v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-GCN-REAL16-NEXT:    v_and_b16 v0.h, 0x7fff, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
@@ -488,8 +477,9 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GISEL-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-REAL16-NEXT:    v_pack_b32_f16 v0, |v0.l|, |v0.h|
+; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT:    v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-GISEL-REAL16-NEXT:    v_and_b16 v0.h, 0x7fff, v0.h
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GISEL-REAL16-NEXT:    ; use v0
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMEND
@@ -599,8 +589,9 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GCN-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GCN-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GCN-REAL16-NEXT:    v_pack_b32_f16 v0, -v0.l, -v0.h
+; GFX11-GCN-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GCN-REAL16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-GCN-REAL16-NEXT:    v_xor_b16 v0.h, 0x8000, v0.h
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GCN-REAL16-NEXT:    ; use v0
 ; GFX11-GCN-REAL16-NEXT:    ;;#ASMEND
@@ -619,8 +610,9 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
 ; GFX11-GISEL-REAL16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-REAL16-NEXT:    v_add_f16_e32 v0.h, 2.0, v0.h
-; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-REAL16-NEXT:    v_pack_b32_f16 v0, -v0.l, -v0.h
+; GFX11-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-REAL16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-GISEL-REAL16-NEXT:    v_xor_b16 v0.h, 0x8000, v0.h
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMSTART
 ; GFX11-GISEL-REAL16-NEXT:    ; use v0
 ; GFX11-GISEL-REAL16-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
index d500a3e50f9f7..acca5169126e4 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
@@ -272,12 +272,11 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmax_v3half:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7e00
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -325,12 +324,11 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7e00
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v0.l, v0.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
index 56e7e045e40eb..3f56d9871c568 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
@@ -272,12 +272,11 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
 ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_fmin_v3half:
 ; GFX11-SDAG-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7e00
 ; GFX11-SDAG-TRUE16-NEXT:    v_max_f16_e32 v1.l, v1.l, v1.l
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_max_f16 v0, v0, v0
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_pk_min_f16 v0, v0, v1
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -325,12 +324,11 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.h, 0x7e00
 ; GFX12-SDAG-TRUE16-NEXT:    v_max_num_f16_e32 v1.l, v1.l, v1.l
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_max_num_f16 v0, v0, v0
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0x7e00
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_pk_min_num_f16 v0, v0, v1
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-SDAG-TRUE16-NEXT:    v_min_num_f16_e32 v0.l, v0.l, v0.h
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;



More information about the llvm-commits mailing list