[llvm] [AMDGPU][True16][CodeGen] insert vgpr32 for SelectMadMixFma for 16bit (PR #159648)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 08:46:15 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/159648
>From 331d62e3c5995d156c29dabb78fe321977d5954c Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 22 Sep 2025 13:43:20 -0400
Subject: [PATCH] madmixfma use vgpr16
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 28 ++-
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 39 ++++
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h | 1 +
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 +
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 131 +++++++----
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 36 ++-
llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll | 93 ++++++++
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +++--
llvm/test/CodeGen/AMDGPU/frem.ll | 212 +++++++++---------
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 80 ++++---
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 135 ++++++-----
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 5 +-
llvm/test/CodeGen/AMDGPU/preserve-hi16.ll | 3 +-
13 files changed, 535 insertions(+), 289 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..2158a4d0e2076 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4078,18 +4078,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
// register.
Mods |= SISrcMods::OP_SEL_1;
- if (IsExtractHigh ||
- (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
- Mods |= SISrcMods::OP_SEL_0;
+ if (Src.getValueSizeInBits() == 16) {
+ if (isExtractHiElt(Src, Src)) {
+ Mods |= SISrcMods::OP_SEL_0;
- // TODO: Should we try to look for neg/abs here?
- }
+ // TODO: Should we try to look for neg/abs here?
+ return true;
+ }
+
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ return true;
+ }
+
+ if (Subtarget->useRealTrue16Insts())
+ // In true16 mode, pack the 16-bit src into a 32-bit value.
+ Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
+ } else if (IsExtractHigh)
+ Mods |= SISrcMods::OP_SEL_0;
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 6acbf52b97de5..11b9d0b1840d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -175,6 +175,41 @@ void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
}
}
+void AMDGPUMCInstLower::lowerT16FmaMixFP16(const MachineInstr *MI,
+ MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
+ const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode, llvm::AMDGPU::OpName::vdst);
+ const MachineOperand &VDst = MI->getOperand(VDstIdx);
+ bool IsHi = AMDGPU::isHi16Reg(VDst.getReg(), TRI);
+ // Select the hi or lo MC opcode depending on whether vdst is a hi16 register.
+ switch (Opcode) {
+ case AMDGPU::V_FMA_MIX_F16_t16:
+ Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_F16 : AMDGPU::V_FMA_MIXLO_F16;
+ break;
+ case AMDGPU::V_FMA_MIX_BF16_t16:
+ Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_BF16 : AMDGPU::V_FMA_MIXLO_BF16;
+ break;
+ }
+ int MCOpcode = TII->pseudoToMCOpcode(Opcode);
+ assert(MCOpcode != -1 &&
+ "Pseudo instruction doesn't have a target-specific version");
+ OutMI.setOpcode(MCOpcode);
+
+ // lower operands
+ for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
+ const MachineOperand &MO = MI->getOperand(I);
+ MCOperand MCOp;
+ if (I == VDstIdx)
+ MCOp = MCOperand::createReg(TRI.get32BitRegister(VDst.getReg()));
+ else
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
@@ -201,6 +236,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
} else if (AMDGPU::getT16D16Helper(Opcode)) {
lowerT16D16Helper(MI, OutMI);
return;
+ } else if (Opcode == AMDGPU::V_FMA_MIX_F16_t16 ||
+ Opcode == AMDGPU::V_FMA_MIX_BF16_t16) {
+ lowerT16FmaMixFP16(MI, OutMI);
+ return;
}
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 68b8d4e25a6cc..23ed55d45220f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -38,6 +38,7 @@ class AMDGPUMCInstLower {
void lower(const MachineInstr *MI, MCInst &OutMI) const;
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
+ void lowerT16FmaMixFP16(const MachineInstr *MI, MCInst &OutMI) const;
};
namespace {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70223da961e92..b3e4a885c23c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9498,6 +9498,13 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
DescSize = Desc.getSize();
}
+ // For the true16 FMA mix pseudo instructions, use the size of the real MC instruction.
+ if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
+ // The fma_mixlo/fma_mixhi f16 and bf16 instructions all have the same size.
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
+ DescSize = Desc.getSize();
+ }
+
return DescSize;
}
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f7279b664ed27..52ee1e874ad86 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -64,6 +64,13 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}
+class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
+ : VOP3P_Mix_Profile<P, Features, 0> {
+ let IsTrue16 = 1;
+ let IsRealTrue16 = 1;
+ let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
+}
+
multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
@@ -95,6 +102,16 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
} // end SubtargetPredicate = isGFX11Plus
}
+multiclass VOP3_VOP3PInst_t16<string OpName, VOP3P_Mix_Profile P> {
+ def NAME : VOP3P_Pseudo<OpName, P>;
+
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName#"_dpp";
+ }
+}
+
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
-multiclass MadFmaMixPats<SDPatternOperator fma_like,
- Instruction mix_inst,
- Instruction mixlo_inst,
- Instruction mixhi_inst,
- ValueType VT = f16,
- ValueType vecVT = v2f16> {
+multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ ValueType VT = f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
@@ -189,7 +203,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
+}
+multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
def : GCNPat <
(AMDGPUclamp (build_vector
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
@@ -243,9 +264,6 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
// FIXME: Special case handling for maxhi (especially for clamp)
// because dealing with the write to high half of the register is
// difficult.
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
- let True16Predicate = p in {
-
def : GCNPat <
(build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
@@ -269,45 +287,60 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
DSTCLAMP.ENABLE,
VGPR_32:$elt0))
>;
+}
- } // end True16Predicate
+multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
+ Instruction mix_inst_16,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ def : GCNPat <
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ (i32 0), (i32 0),
+ DSTCLAMP.NONE)
+ >;
- let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
- (vecVT (mixlo_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE)
>;
+
def : GCNPat <
- (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
- (vecVT (mixhi_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE)
>;
def : GCNPat <
- (build_vector
- VT:$elt0,
- (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
- (vecVT (mixhi_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
+ (AMDGPUclamp (build_vector
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE), lo16,
+ (mix_inst_16 $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE), hi16))
>;
- } // end True16Predicate
}
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -341,7 +374,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+defm : MadFmaMixFP32Pats<fmad, V_MAD_MIX_F32>;
+defm : MadFmaMixFP16Pats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // OtherPredicates = [NoFP32Denormals]
} // End SubtargetPredicate = HasMadMixInsts
@@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F
let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
+
+// Pseudo true16 inst for v_fma_mixlo/hi_f16
+defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32>;
+
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
+defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+let True16Predicate = UseRealTrue16Insts in
+defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_F16_t16>;
}
let SubtargetPredicate = HasFmaMixBF16Insts in {
@@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP
let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
}
+
+// Pseudo true16 inst for v_fma_mixlo/hi_bf16
+defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
} // End FPDPRounding = 1
} // End isCommutable = 1
-defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32_BF16, bf16>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
+defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+let True16Predicate = UseRealTrue16Insts in
+defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..7f6a920d25016 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll
new file mode 100644
index 0000000000000..1ba13b287be46
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16
+
+; Make sure no "vgpr32 = copy vgpr16" is generated
+
+define amdgpu_kernel void @fma_mix_f16 (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) {
+ ; GFX11-REAL16-LABEL: name: fma_mix_f16
+ ; GFX11-REAL16: bb.0.entry:
+ ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: {{ $}}
+ ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+ ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; GFX11-REAL16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; GFX11-REAL16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1)
+ ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 14336, 0, implicit $exec
+ ; GFX11-REAL16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, killed [[V_MOV_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-REAL16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX11-REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_ADD_F16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16
+ ; GFX11-REAL16-NEXT: [[V_FMA_MIX_F16_t16_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMA_MIX_F16_t16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[REG_SEQUENCE4]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GFX11-REAL16-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIX_F16_t16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1)
+ ; GFX11-REAL16-NEXT: S_ENDPGM 0
+ ;
+ ; GFX11-FAKE16-LABEL: name: fma_mix_f16
+ ; GFX11-FAKE16: bb.0.entry:
+ ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: {{ $}}
+ ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3
+ ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5
+ ; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7
+ ; GFX11-FAKE16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6
+ ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1
+ ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
+ ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2
+ ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1)
+ ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 14336
+ ; GFX11-FAKE16-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, killed [[GLOBAL_LOAD_USHORT_SADDR]], 0, killed [[S_MOV_B32_2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX11-FAKE16-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; GFX11-FAKE16-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[V_ADD_F16_fake16_e64_]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-FAKE16-NEXT: GLOBAL_STORE_SHORT_SADDR killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIXLO_F16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1)
+ ; GFX11-FAKE16-NEXT: S_ENDPGM 0
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid
+ %in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid
+ %in.gep3 = getelementptr i32, ptr addrspace(1) %c, i32 %tid
+ %load.a = load float, ptr addrspace(1) %in.gep1
+ %load.b = load float, ptr addrspace(1) %in.gep2
+ %load.c = load half, ptr addrspace(1) %in.gep3
+ %add.c = fadd half %load.c, 0.5
+ %load.float.c = fpext half %add.c to float
+ %result = tail call float @llvm.fmuladd.f32(float %load.a, float %load.b, float %load.float.c)
+ %half = fptrunc float %result to half
+ store half %half, ptr addrspace(1) %out
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index a859cc91b7fde..fe95d4561d0cd 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0x46000000
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
@@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2.0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v1, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v0, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index c4a38dcd7b5f3..78a961ea0da17 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: fast_frem_f16:
@@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: fast_frem_f16:
@@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: fast_frem_f16:
@@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: unsafe_frem_f16:
@@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-TRUE16-NEXT: s_clause 0x1
; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-TRUE16-NEXT: s_clause 0x1
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1150-TRUE16-NEXT: s_endpgm
;
; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
@@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1200-TRUE16-NEXT: s_clause 0x1
; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1200-TRUE16-NEXT: s_clause 0x1
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
-; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
-; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1]
; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
-; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
-; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
-; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1]
; GFX1200-TRUE16-NEXT: s_endpgm
;
; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 1ae3434db6da5..3f66c23e1a73b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -65,10 +65,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3c00
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 1.0, v0.l
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
@@ -137,13 +136,20 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GFX9: ; %bb.0:
@@ -172,6 +178,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -196,10 +210,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
@@ -277,10 +289,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; SDAG-GFX11-TRUE16: ; %bb.0:
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
@@ -499,14 +509,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
-; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-TRUE16: ; %bb.0:
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
+; SDAG-GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; SDAG-GFX11-FAKE16: ; %bb.0:
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc
+; SDAG-GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GFX9: ; %bb.0:
@@ -542,6 +563,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index eab92668c536b..21e6faf46f58d 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -412,11 +412,9 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32:
@@ -535,12 +533,10 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32:
@@ -704,16 +700,13 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.h
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v6.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32:
@@ -914,14 +907,23 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; FIXME (DAG): Fold clamp
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_mov_b32_e32 v0, v3
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GFX900: ; %bb.0:
@@ -978,6 +980,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1040,13 +1051,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
@@ -1247,17 +1258,29 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-TRUE16: ; %bb.0:
+; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100-FAKE16: ; %bb.0:
+; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GFX900: ; %bb.0:
@@ -1358,6 +1381,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1452,10 +1487,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
@@ -1618,9 +1653,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v0.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
@@ -2385,10 +2420,8 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 {
; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a4878539b1c74..95df131e21358 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -2253,9 +2253,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1]
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
index 79910af5c0434..93f4ea37117ba 100644
--- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
+++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll
@@ -929,9 +929,8 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) {
; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2
+; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16:
More information about the llvm-commits
mailing list