[llvm] [AMDGPU][True16][CodeGen] insert vgpr32 for SelectMadMixFma for f16/bf16 (PR #159648)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 22 09:50:38 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/159648

>From 6609e8536f7d661c1f87c0c80a67714fe312a961 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 22 Sep 2025 11:30:02 -0400
Subject: [PATCH] update madmix mod

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  25 +-
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |  36 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |  54 +--
 llvm/test/CodeGen/AMDGPU/frem.ll              | 212 ++++-----
 llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll        |  75 ++-
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        | 227 +++++++--
 llvm/test/CodeGen/AMDGPU/mad-mix.ll           | 445 +++++++++++++-----
 7 files changed, 722 insertions(+), 352 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..2f8b36ad54f14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4078,18 +4078,23 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
   // register.
 
   Mods |= SISrcMods::OP_SEL_1;
-  if (IsExtractHigh ||
-      (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
-    Mods |= SISrcMods::OP_SEL_0;
+  if (Src.getValueSizeInBits() == 16) {
+    if (Subtarget->useRealTrue16Insts()) {
+      // In true16 mode, pack src to a 32bit
+      if (Src.getOpcode() == ISD::TRUNCATE &&
+          Src.getOperand(0).getValueType() == MVT::i32) {
+        Src = Src.getOperand(0);
+      } else {
+        Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
+      }
+    } else if (isExtractHiElt(Src, Src)) {
+      Mods |= SISrcMods::OP_SEL_0;
 
-    // TODO: Should we try to look for neg/abs here?
-  }
+      // TODO: Should we try to look for neg/abs here?
+    }
+  } else if (Src.getValueSizeInBits() == 32 && IsExtractHigh)
+    Mods |= SISrcMods::OP_SEL_0;
 
-  // Prevent unnecessary subreg COPY to VGPR_16
-  if (Src.getOpcode() == ISD::TRUNCATE &&
-      Src.getOperand(0).getValueType() == MVT::i32) {
-    Src = Src.getOperand(0);
-  }
   return true;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..7f6a920d25016 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v4, v3
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v4, v7, v3
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v4, v5, v0
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index a859cc91b7fde..fe95d4561d0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 0x46000000
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v2, 0x46000000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v3, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v3, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
@@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX11-TRUE16-NEXT:    v_lshlrev_b16 v0.l, v0.l, 1
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, 2.0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, v1, v1
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v2, v0, v0
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v4, v1
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v2, v3, v0
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index c4a38dcd7b5f3..78a961ea0da17 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
 ; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1]
 ; GFX1150-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1200-TRUE16-NEXT:    s_clause 0x1
 ; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-TRUE16-NEXT:    s_clause 0x1
-; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1]
 ; GFX1200-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1200-FAKE16-LABEL: fast_frem_f16:
@@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX11-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
 ; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-TRUE16-NEXT:    s_clause 0x1
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1]
 ; GFX1150-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
@@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1200-TRUE16-NEXT:    s_clause 0x1
 ; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-TRUE16-NEXT:    s_clause 0x1
-; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v3, v1, s[4:5] offset:8
 ; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
 ; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
 ; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
 ; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT:    global_store_b16 v1, v2, s[0:1]
 ; GFX1200-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 1ae3434db6da5..030b1b58f5b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -65,10 +65,10 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
 ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
 ; SDAG-GFX11-TRUE16:       ; %bb.0:
 ; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0x3c00
-; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3c00
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
@@ -137,13 +137,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
 }
 
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v3
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-TRUE16:       ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v4, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-FAKE16:       ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; GFX9:       ; %bb.0:
@@ -172,6 +181,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
 ; SDAG-CI-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -499,14 +516,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
 }
 
 define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT:    global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-TRUE16:       ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v0, v3, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v0, v3, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-TRUE16-NEXT:    global_store_b16 v[0:1], v1, off dlc
+; SDAG-GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-FAKE16:       ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v3, off dlc
+; SDAG-GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; GFX9:       ; %bb.0:
@@ -542,6 +570,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
 ; SDAG-CI-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT:    v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX11-NEXT:    global_store_b16 v[0:1], v3, off dlc
+; GISEL-GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index eab92668c536b..3c8a5b25b7c26 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -411,7 +411,14 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v3.l
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
@@ -534,12 +541,18 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.h
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v6, v7, v8 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v6
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -704,8 +717,21 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v6, v6, v8, v10 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v7, v7, v9, v11 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v6.l
 ; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v7.l
@@ -914,14 +940,28 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
 ; FIXME (DAG): Fold clamp
 
 define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v3, v4, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GFX900:       ; %bb.0:
@@ -978,6 +1018,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v1, v1 clamp
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1040,13 +1089,17 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, v3.h
 ; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.l, 0
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v3, v5, v6 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1247,17 +1300,39 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 }
 
 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v5.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v7, v9, v11 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v1, v6, v8, v10 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GFX900:       ; %bb.0:
@@ -1358,6 +1433,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1452,10 +1539,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
@@ -1617,10 +1709,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
@@ -1789,12 +1886,18 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_precvt:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v3
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v3
 ; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1947,15 +2050,22 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v6.h
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v6
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v6, v7, v8 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v1
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
@@ -2141,19 +2251,32 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v7.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v8.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v9.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v5.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v11.h, v6.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v10.h, v6.h
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v3, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v6
-; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v7
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v7, v7, v9, v11 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v6, v6, v8, v10 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v2
 ; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v3
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v7
+; SDAG-GFX1100-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v6
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v1, v1.h, v1.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.h, v0.l
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4878539b1c74..1aa8c5b0ec057 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -75,11 +75,24 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
 }
 
 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v3, v0, v1 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
 ; GFX900:       ; %bb.0:
@@ -122,6 +135,12 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v2
 ; CI-NEXT:    v_mac_f32_e32 v0, v3, v1
 ; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
   %src0.hi = lshr i32 %src0, 16
   %src1.hi = lshr i32 %src1, 16
   %src2.hi = lshr i32 %src2, 16
@@ -139,11 +158,24 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
 }
 
 define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v3, v0, v1 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GFX900:       ; %bb.0:
@@ -181,6 +213,12 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v1, v3, v5
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -200,30 +238,44 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
 }
 
 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v3, v4, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_mad_mix_v2f32:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT:    v_mov_b32_e32 v0, v3
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX906-LABEL: v_mad_mix_v2f32:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT:    v_mov_b32_e32 v0, v3
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX900:       ; %bb.0:
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX906-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX906:       ; %bb.0:
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v3
+; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32:
 ; SDAG-GFX9GEN:       ; %bb.0:
@@ -271,6 +323,31 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v0, v4, v2
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX900:       ; %bb.0:
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX906:       ; %bb.0:
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32:
 ; GISEL-GFX9GEN:       ; %bb.0:
 ; GISEL-GFX9GEN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,14 +394,30 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
 }
 
 define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_shuffle:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v5.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v4.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v2, v3, v1, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v0, v4, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_shuffle:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_v2f32_shuffle:
 ; GFX900:       ; %bb.0:
@@ -383,6 +476,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
 ; SDAG-CI-NEXT:    v_mad_f32 v1, v4, v3, v5
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1193,32 +1295,45 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
 }
 
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
-; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 1.0
-; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_mov_b32 s0, 1.0
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    s_mov_b32 s0, 1.0
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX900:       ; %bb.0:
 ; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 1.0
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX906:       ; %bb.0:
 ; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 1.0
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imm1:
@@ -1325,32 +1440,45 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 }
 
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
-; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0x3e230000
-; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
 ; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0x3e230000
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
 ; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0x3e230000
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
@@ -1464,32 +1592,45 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 }
 
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
-; SDAG-GFX1100:       ; %bb.0:
-; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0.15915494
-; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
 ; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX900-NEXT:    s_mov_b32 s4, 0.15915494
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX906:       ; %bb.0:
 ; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-GFX906-NEXT:    s_mov_b32 s4, 0.15915494
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT:    v_mov_b32_e32 v1, v2
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imminv2pi:
@@ -1599,11 +1740,24 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 }
 
 define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, v3, v0, v1 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GFX900:       ; %bb.0:
@@ -1641,6 +1795,12 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v1, v3, v5 clamp
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2253,9 +2413,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
 ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
 ; SDAG-GFX1100-TRUE16:       ; %bb.0:
 ; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_xor_b16 v2.l, 0x8000, v0.h
 ; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
@@ -2339,11 +2500,20 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
 }
 
 define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
@@ -2383,6 +2553,12 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2403,11 +2579,20 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
 }
 
 define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, -v2, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
@@ -2447,6 +2632,12 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
 ; GISEL-GFX9GEN:       ; %bb.0:
 ; GISEL-GFX9GEN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2488,11 +2679,20 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
 }
 
 define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
@@ -2532,6 +2732,12 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
 ; GISEL-GFX9GEN:       ; %bb.0:
 ; GISEL-GFX9GEN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2573,11 +2779,20 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
 }
 
 define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16:       ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT:    v_fma_mix_f32 v0, -|v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16:       ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GFX900:       ; %bb.0:
@@ -2617,6 +2832,12 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg,
 ; SDAG-CI-NEXT:    v_mad_f32 v0, v0, v1, v2
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
 ; GISEL-GFX9GEN:       ; %bb.0:
 ; GISEL-GFX9GEN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)



More information about the llvm-commits mailing list