[llvm] update madmixpat mod (PR #159648)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 13:41:43 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/159648
>From a128b49b89ea86ce78471cbd6023a49054cd153d Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 17 Sep 2025 18:25:51 -0400
Subject: [PATCH] update madmixpat mod
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 41 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 998 +++++++-----------
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 36 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 212 ++--
llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll | 17 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 75 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 206 ++--
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 424 ++++++--
.../CodeGen/AMDGPU/vector_shuffle.packed.ll | 503 ++++-----
11 files changed, 1281 insertions(+), 1291 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..a6ca086e3010b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3989,13 +3989,13 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
return SelectVOP3Mods(In, Src, SrcMods);
}
-// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// Match lowered fpext from f16/bf16 to f32. This is a bit operation extending
// a 16-bit value with 16-bit of zeroes at LSB:
//
-// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast f16/bf16:val)))))
// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
// 3. (f32 (bitcast (shl i32:va, 16) -> IsExtractHigh = false
-static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+static SDValue match16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
return SDValue();
Op = Op.getOperand(0);
@@ -4006,7 +4006,7 @@ static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
if (!Low16 || !Low16->isZero())
return SDValue();
Op = stripBitcast(Op.getOperand(1));
- if (Op.getValueType() != MVT::bf16)
+ if (Op.getValueType() != MVT::bf16 && Op.getValueType() != MVT::f16)
return SDValue();
return Op;
}
@@ -4045,16 +4045,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
bool IsExtractHigh = false;
if (Src.getOpcode() == ISD::FP_EXTEND) {
Src = Src.getOperand(0);
- } else if (VT == MVT::bf16) {
- SDValue B16 = matchBF16FPExtendLike(Src, IsExtractHigh);
- if (!B16)
- return false;
- Src = B16;
+ } else if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ // Prevent unnecessary subreg COPY to VGPR_16
+ Src = Src.getOperand(0);
+ } else if (SDValue FP16 = match16FPExtendLike(Src, IsExtractHigh)) {
+ Src = FP16;
} else
return false;
if (Src.getValueType() != VT &&
- (VT != MVT::bf16 || Src.getValueType() != MVT::i32))
+ ((VT != MVT::f16 && VT != MVT::bf16) || Src.getValueType() != MVT::i32))
return false;
Src = stripBitcast(Src);
@@ -4078,18 +4079,18 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
// register.
Mods |= SISrcMods::OP_SEL_1;
- if (IsExtractHigh ||
- (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
- Mods |= SISrcMods::OP_SEL_0;
+ if (Src.getValueSizeInBits() == 16) {
+ if (Subtarget->useRealTrue16Insts())
+ // In true16 mode, create a Src32 with 16bit src
+ Src = createVOP3PSrc32From16Lo(Src, In, CurDAG, Subtarget);
+ else if (isExtractHiElt(Src, Src)) {
+ Mods |= SISrcMods::OP_SEL_0;
- // TODO: Should we try to look for neg/abs here?
- }
+ // TODO: Should we try to look for neg/abs here?
+ }
+ } else if (Src.getValueSizeInBits() == 32 && IsExtractHigh)
+ Mods |= SISrcMods::OP_SEL_0;
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
return true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 44c719f3635c8..d464062768034 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -46090,93 +46090,70 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX950-LABEL: v_fma_v2bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -46319,132 +46296,93 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX950-LABEL: v_fma_v3bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5
-; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3
+; GFX950-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX10-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
+; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
@@ -46623,162 +46561,111 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX950-LABEL: v_fma_v4bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6
+; GFX950-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fma_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
-; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v4, v9, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fma_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v4, v9, v6, 0x7fff
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v8
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v5, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v7, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v3, 0x7fff
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v0, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_add3_u32 v8, v9, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fma_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX11FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11FAKE16-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v9, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
@@ -47026,93 +46913,70 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX950-LABEL: v_fmuladd_v2bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
-; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v2bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc_lo
+; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v2bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_bfe_u32 v0, v3, 16, 1
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_bfe_u32 v1, v3, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
@@ -47267,132 +47131,93 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX950-LABEL: v_fmuladd_v3bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5
-; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3
+; GFX950-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
+; GFX10-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
-; GFX10-NEXT: v_alignbit_b32 v1, s4, v3, 16
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v3bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11TRUE16-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v0, v7, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
+; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v3bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v5, 16, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11FAKE16-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11FAKE16-NEXT: v_bfe_u32 v0, v5, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v0, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16
+; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c)
ret <3 x bfloat> %op
@@ -47587,162 +47412,111 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX950-LABEL: v_fmuladd_v4bf16:
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
-; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3
-; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1
-; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6
+; GFX950-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fmuladd_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
-; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
-; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
-; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v4, v9, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_fmuladd_v4bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v3
-; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11TRUE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v4
-; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_add3_u32 v4, v9, v6, 0x7fff
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0
-; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v0, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v10, v8
-; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v5, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
-; GFX11TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX11TRUE16-NEXT: v_bfe_u32 v0, v7, 16, 1
-; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v7
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11TRUE16-NEXT: v_add3_u32 v5, v9, v3, 0x7fff
-; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v7, 0x7fff
-; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v0, v10, vcc_lo
-; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo
+; GFX11TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6
+; GFX11TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff
+; GFX11TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
-; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_add3_u32 v8, v9, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v10, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_fmuladd_v4bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3
-; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX11FAKE16-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v6
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v6, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v6
+; GFX11FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v0, v2
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
-; GFX11FAKE16-NEXT: v_bfe_u32 v2, v5, 16, 1
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
-; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX11FAKE16-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX11FAKE16-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_bfe_u32 v3, v7, 16, 1
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11FAKE16-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11FAKE16-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v7
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v0, v9, vcc_lo
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v3, v2, 0x7060302
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX11FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11FAKE16-NEXT: v_add3_u32 v4, v9, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..7f6a920d25016 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index a859cc91b7fde..fe95d4561d0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0x46000000
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
@@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2.0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v1, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v0, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index c4a38dcd7b5f3..78a961ea0da17 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: fast_frem_f16:
@@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
@@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
index c96ba754c0811..c93c4e04f3a23 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -20,7 +20,7 @@ define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int(i32 %src0, i32 %src1, i32 %
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.hi = lshr i32 %src0, 16
%src1.hi = lshr i32 %src1, 16
@@ -43,7 +43,7 @@ define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bf
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.hi = extractelement <2 x bfloat> %src0, i32 1
%src1.hi = extractelement <2 x bfloat> %src1, i32 1
@@ -320,7 +320,7 @@ define float @v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.hi = extractelement <2 x bfloat> %src0, i32 1
%src1.hi = extractelement <2 x bfloat> %src1, i32 1
@@ -560,10 +560,7 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_fmac_f32_e32 v0, v3, v1
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.bf16 = bitcast half %src0 to bfloat
%src1.bf16 = bitcast half %src1 to bfloat
@@ -580,9 +577,7 @@ define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bflo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,0,0]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%src0.bf16 = bitcast half %src0 to bfloat
%src0.ext = fpext bfloat %src0.bf16 to float
@@ -597,7 +592,7 @@ define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, p
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1]
+; GFX1250-NEXT: v_fma_mix_f32 v0, s0, 0, s1 op_sel_hi:[0,0,1]
; GFX1250-NEXT: s_mov_b32 s0, 0
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 1ae3434db6da5..030b1b58f5b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -65,10 +65,10 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3c00
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x3c00
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
@@ -137,13 +137,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v4, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GFX9: ; %bb.0:
@@ -172,6 +181,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -499,14 +516,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v3, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v3, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v1, off dlc
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GFX9: ; %bb.0:
@@ -542,6 +570,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
index 03304ae3946b3..5fa145a974df8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
@@ -345,9 +345,9 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloa
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index eab92668c536b..09586cda408b0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -411,12 +411,14 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
@@ -534,13 +536,15 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v6, v7, v8 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32:
@@ -704,16 +708,21 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v6, v8, v10 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v7, v9, v11 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.h, v7.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32:
@@ -914,14 +923,25 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; FIXME (DAG): Fold clamp
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v4, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GFX900: ; %bb.0:
@@ -978,6 +998,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1040,13 +1069,14 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v5, v6 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1247,17 +1277,33 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v9.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v11.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v0, v7, v9, v11 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v1, v6, v8, v10 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GFX900: ; %bb.0:
@@ -1358,6 +1404,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1453,9 +1511,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v0, v1, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
@@ -1618,9 +1679,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v0, v1, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
@@ -1789,13 +1853,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v3, v4, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_precvt:
@@ -1947,15 +2014,19 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v6
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v6, v7, v8 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_precvt:
@@ -2141,19 +2212,26 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v6
-; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v7
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v6, v8, v10 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v3, v7, v9, v11 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
+; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2
; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v3
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.h, v1.l
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_precvt:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4878539b1c74..025e4027503cc 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -75,11 +75,21 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
}
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
; GFX900: ; %bb.0:
@@ -122,6 +132,12 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
; CI-NEXT: v_cvt_f32_f16_e32 v0, v2
; CI-NEXT: v_mac_f32_e32 v0, v3, v1
; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
%src0.hi = lshr i32 %src0, 16
%src1.hi = lshr i32 %src1, 16
%src2.hi = lshr i32 %src2, 16
@@ -139,11 +155,21 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
}
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
; GFX900: ; %bb.0:
@@ -181,6 +207,12 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -200,30 +232,41 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
}
define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v3, v4, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v2f32:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v2f32:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_mov_b32_e32 v0, v3
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v3
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX906-LABEL: v_mad_mix_v2f32:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v3
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32:
; SDAG-GFX9GEN: ; %bb.0:
@@ -271,6 +314,31 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
; SDAG-CI-NEXT: v_mac_f32_e32 v0, v4, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v2f32:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32:
; GISEL-GFX9GEN: ; %bb.0:
; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -317,14 +385,27 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
}
define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_shuffle:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v2, v2, v1, v3 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v0, v4, v3 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_shuffle:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v2f32_shuffle:
; GFX900: ; %bb.0:
@@ -383,6 +464,15 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
; SDAG-CI-NEXT: v_mad_f32 v1, v4, v3, v5
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1193,32 +1283,43 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
}
define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
-; SDAG-GFX1100: ; %bb.0:
-; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT: s_mov_b32 s0, 1.0
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 1.0
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 1.0
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX900-NEXT: s_mov_b32 s4, 1.0
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imm1:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: s_mov_b32 s4, 1.0
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imm1:
@@ -1325,32 +1426,43 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
}
define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
-; SDAG-GFX1100: ; %bb.0:
-; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x3e230000
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX900-NEXT: s_mov_b32 s4, 0x3e230000
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: s_mov_b32 s4, 0x3e230000
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
@@ -1464,32 +1576,43 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
}
define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
-; SDAG-GFX1100: ; %bb.0:
-; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0.15915494
-; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v2
-; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v1, v2, v3, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX900-NEXT: s_mov_b32 s4, 0.15915494
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX906-NEXT: s_mov_b32 s4, 0.15915494
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, s4 op_sel_hi:[1,1,0]
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v1, v2
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_f32imminv2pi:
@@ -1599,11 +1722,21 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
}
define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; GFX900: ; %bb.0:
@@ -1641,6 +1774,12 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
; SDAG-CI-NEXT: v_mad_f32 v0, v1, v3, v5 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2253,9 +2392,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
@@ -2339,11 +2479,20 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
}
define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
@@ -2383,6 +2532,12 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2403,11 +2558,20 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
}
define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, -v2, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
@@ -2447,6 +2611,12 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; GISEL-GFX9GEN: ; %bb.0:
; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2488,11 +2658,20 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
}
define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
@@ -2532,6 +2711,12 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; GISEL-GFX9GEN: ; %bb.0:
; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2573,11 +2758,20 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
}
define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, -|v2|, v1, v0 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
@@ -2617,6 +2811,12 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg,
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; GISEL-GFX9GEN: ; %bb.0:
; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index b01e92d6979a3..e3c4769155cb1 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -5292,351 +5292,272 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX942-NEXT: s_mov_b32 s3, 0x7060302
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
-; GFX942-NEXT: global_load_dwordx2 v[4:5], v6, s[10:11]
-; GFX942-NEXT: s_waitcnt vmcnt(2)
-; GFX942-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX942-NEXT: s_waitcnt vmcnt(1)
-; GFX942-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
+; GFX942-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX942-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX942-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX942-NEXT: v_and_b32_e32 v12, 0xffff0000, v3
-; GFX942-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX942-NEXT: v_fmac_f32_e32 v8, v7, v9
-; GFX942-NEXT: v_fmac_f32_e32 v2, v7, v4
-; GFX942-NEXT: v_fmac_f32_e32 v3, v11, v4
-; GFX942-NEXT: v_bfe_u32 v4, v8, 16, 1
-; GFX942-NEXT: v_fmac_f32_e32 v12, v11, v9
-; GFX942-NEXT: v_or_b32_e32 v7, 0x400000, v8
-; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1
-; GFX942-NEXT: v_add3_u32 v4, v4, v8, s2
+; GFX942-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_fma_mix_f32 v8, v1, v2, v5 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_fma_mix_f32 v2, v1, v2, v5 op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX942-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX942-NEXT: v_add3_u32 v5, v5, v7, s2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX942-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX942-NEXT: v_add3_u32 v10, v10, v4, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX942-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX942-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v12, v12, v8, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX942-NEXT: v_or_b32_e32 v11, 0x400000, v2
-; GFX942-NEXT: v_bfe_u32 v13, v12, 16, 1
-; GFX942-NEXT: v_add3_u32 v9, v9, v2, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GFX942-NEXT: v_or_b32_e32 v15, 0x400000, v2
+; GFX942-NEXT: v_add3_u32 v14, v14, v2, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX942-NEXT: v_or_b32_e32 v14, 0x400000, v12
-; GFX942-NEXT: v_bfe_u32 v15, v3, 16, 1
-; GFX942-NEXT: v_add3_u32 v13, v13, v12, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX942-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX942-NEXT: v_or_b32_e32 v16, 0x400000, v3
-; GFX942-NEXT: v_add3_u32 v15, v15, v3, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX942-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc
-; GFX942-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX942-NEXT: v_fmac_f32_e32 v2, v0, v10
-; GFX942-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX942-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX942-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX942-NEXT: v_fmac_f32_e32 v4, v0, v5
-; GFX942-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX942-NEXT: v_fmac_f32_e32 v3, v1, v10
-; GFX942-NEXT: v_fmac_f32_e32 v7, v1, v5
-; GFX942-NEXT: v_or_b32_e32 v1, 0x400000, v2
-; GFX942-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX942-NEXT: v_add3_u32 v0, v0, v2, s2
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX942-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX942-NEXT: v_add3_u32 v5, v5, v4, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT: v_fma_mix_f32 v4, v0, v3, v4 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_fma_mix_f32 v0, v0, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc
+; GFX942-NEXT: v_fma_mix_f32 v2, v1, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_fma_mix_f32 v1, v1, v3, v7 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX942-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX942-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v4, s2
; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX942-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX942-NEXT: v_add3_u32 v9, v9, v3, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX942-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX942-NEXT: v_add3_u32 v11, v11, v7, s2
-; GFX942-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
-; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX942-NEXT: v_perm_b32 v0, v2, v0, s3
+; GFX942-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX942-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v7, v7, v0, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX942-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v9, v9, v2, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_or_b32_e32 v12, 0x400000, v1
+; GFX942-NEXT: v_add3_u32 v11, v11, v1, s2
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: v_perm_b32 v0, v0, v3, s3
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc
-; GFX942-NEXT: v_perm_b32 v1, v3, v1, s3
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
+; GFX942-NEXT: v_perm_b32 v1, v1, v2, s3
; GFX942-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX10-LABEL: fma_shuffle_v2bf16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[4:5]
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[6:7]
-; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX10-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9
-; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4
-; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
-; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v8, v1, v2, v5 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_fma_mix_f32 v2, v1, v2, v5 op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v7, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v14, v14, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX10-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_fma_mix_f32 v4, v0, v3, v4 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX10-NEXT: v_fma_mix_f32 v0, v0, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_fma_mix_f32 v2, v1, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX10-NEXT: v_fma_mix_f32 v1, v1, v3, v7 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX10-NEXT: v_fmac_f32_e32 v4, v2, v5
-; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10
-; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
-; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: fma_shuffle_v2bf16:
; GFX11-TRUE16: ; %bb.0: ; %entry
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x10
-; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v6, s[0:1]
-; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v6, s[4:5]
-; GFX11-TRUE16-NEXT: global_load_b64 v[4:5], v6, s[6:7]
+; GFX11-TRUE16-NEXT: global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b64 v[4:5], v6, s[4:5]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-TRUE16-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_and_b32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v8, v1, v2, v5 op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, v1, v2, v5 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v7, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_add3_u32 v14, v14, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, v0, v3, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v3, v5 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v2, v1, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v2, v5
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v11, v12, v9
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v5, v8 :: v_dual_fmac_f32 v1, v3, v10
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 16, 1
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v1, v1, v3, v7 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v7, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v12, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v11, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
-; GFX11-TRUE16-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.h
+; GFX11-TRUE16-NEXT: global_store_b64 v6, v[0:1], s[4:5]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fma_shuffle_v2bf16:
; GFX11-FAKE16: ; %bb.0: ; %entry
; GFX11-FAKE16-NEXT: s_clause 0x1
-; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x10
-; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-NEXT: s_clause 0x2
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v6, s[0:1]
-; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v6, s[4:5]
-; GFX11-FAKE16-NEXT: global_load_b64 v[4:5], v6, s[6:7]
+; GFX11-FAKE16-NEXT: global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b64 v[4:5], v6, s[4:5]
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_dual_fmac_f32 v1, v12, v4 :: v_dual_lshlrev_b32 v8, 16, v2
-; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: v_dual_fmac_f32 v7, v8, v9 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v0, v8, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v4, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v8, v1, v2, v5 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v2, v1, v2, v5 op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v7, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v7
+; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v4, 16, 1
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v2, v5
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v11, v12, v9
-; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v7, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v14, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v8, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_add3_u32 v14, v14, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v8
+; GFX11-FAKE16-NEXT: v_add3_u32 v12, v12, v8, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v4, v0, v3, v4 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v14, v15, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v2, v1, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v13, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v2, 16, 1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v1, v1, v3, v7 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x400000, v11
-; GFX11-FAKE16-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
+; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v1, v3, v10
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-FAKE16-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-FAKE16-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x400000, v7
-; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x400000, v1
-; GFX11-FAKE16-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
-; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo
-; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x7060302
-; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
-; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v0, 0x7060302
-; GFX11-FAKE16-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-FAKE16-NEXT: global_store_b64 v6, v[0:1], s[4:5]
; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list