[llvm] [AMDGPU][True16][CodeGen] true16 isel pattern for fma_mix_f16/bf16 (PR #159648)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 23 08:57:07 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
This patch includes:
1. The fma_mix instructions take fp16-typed inputs but place the operands in 32-bit VGPRs. Update the selector to insert a 32-bit VGPR in true16 mode when necessary.
2. The fma_mix instructions return an fp16-typed result but place the vdst in a 32-bit VGPR. Create an fma_mix_t16 pseudo instruction for the isel pattern, and lower it to mix_lo/mix_hi in the MC lowering pass.
These changes stop isel from emitting illegal `vgpr32 = COPY vgpr16` copies and improve code quality.
---
Patch is 75.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159648.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+18-10)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (+39)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h (+1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+7)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+91-40)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+17-19)
- (added) llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll (+93)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+26-28)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+100-112)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll (+55-25)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+84-51)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-hi16.ll (+1-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index c2fca79979e1b..2158a4d0e2076 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4078,18 +4078,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
// register.
Mods |= SISrcMods::OP_SEL_1;
- if (IsExtractHigh ||
- (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
- Mods |= SISrcMods::OP_SEL_0;
+ if (Src.getValueSizeInBits() == 16) {
+ if (isExtractHiElt(Src, Src)) {
+ Mods |= SISrcMods::OP_SEL_0;
- // TODO: Should we try to look for neg/abs here?
- }
+ // TODO: Should we try to look for neg/abs here?
+ return true;
+ }
+
+ if (Src.getOpcode() == ISD::TRUNCATE &&
+ Src.getOperand(0).getValueType() == MVT::i32) {
+ Src = Src.getOperand(0);
+ return true;
+ }
+
+ if (Subtarget->useRealTrue16Insts())
+ // In true16 mode, pack src to a 32bit
+ Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget);
+ } else if (IsExtractHigh)
+ Mods |= SISrcMods::OP_SEL_0;
- // Prevent unnecessary subreg COPY to VGPR_16
- if (Src.getOpcode() == ISD::TRUNCATE &&
- Src.getOperand(0).getValueType() == MVT::i32) {
- Src = Src.getOperand(0);
- }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 6acbf52b97de5..11b9d0b1840d3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -175,6 +175,41 @@ void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI,
}
}
+void AMDGPUMCInstLower::lowerT16FmaMixFP16(const MachineInstr *MI,
+ MCInst &OutMI) const {
+ unsigned Opcode = MI->getOpcode();
+ const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode, llvm::AMDGPU::OpName::vdst);
+ const MachineOperand &VDst = MI->getOperand(VDstIdx);
+ bool IsHi = AMDGPU::isHi16Reg(VDst.getReg(), TRI);
+ // select hi/lo MCInst
+ switch (Opcode) {
+ case AMDGPU::V_FMA_MIX_F16_t16:
+ Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_F16 : AMDGPU::V_FMA_MIXLO_F16;
+ break;
+ case AMDGPU::V_FMA_MIX_BF16_t16:
+ Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_BF16 : AMDGPU::V_FMA_MIXLO_BF16;
+ break;
+ }
+ int MCOpcode = TII->pseudoToMCOpcode(Opcode);
+ assert(MCOpcode != -1 &&
+ "Pseudo instruction doesn't have a target-specific version");
+ OutMI.setOpcode(MCOpcode);
+
+ // lower operands
+ for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) {
+ const MachineOperand &MO = MI->getOperand(I);
+ MCOperand MCOp;
+ if (I == VDstIdx)
+ MCOp = MCOperand::createReg(TRI.get32BitRegister(VDst.getReg()));
+ else
+ lowerOperand(MO, MCOp);
+ OutMI.addOperand(MCOp);
+ }
+}
+
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
const auto *TII = static_cast<const SIInstrInfo *>(ST.getInstrInfo());
@@ -201,6 +236,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
} else if (AMDGPU::getT16D16Helper(Opcode)) {
lowerT16D16Helper(MI, OutMI);
return;
+ } else if (Opcode == AMDGPU::V_FMA_MIX_F16_t16 ||
+ Opcode == AMDGPU::V_FMA_MIX_BF16_t16) {
+ lowerT16FmaMixFP16(MI, OutMI);
+ return;
}
int MCOpcode = TII->pseudoToMCOpcode(Opcode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
index 68b8d4e25a6cc..23ed55d45220f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h
@@ -38,6 +38,7 @@ class AMDGPUMCInstLower {
void lower(const MachineInstr *MI, MCInst &OutMI) const;
void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const;
+ void lowerT16FmaMixFP16(const MachineInstr *MI, MCInst &OutMI) const;
};
namespace {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70223da961e92..b3e4a885c23c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -9498,6 +9498,13 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
DescSize = Desc.getSize();
}
+ // If FMA Pseudo inst, get correct MC code size
+ if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) {
+ // fma lo/hi f16/bf16 inst are in same size
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16);
+ DescSize = Desc.getSize();
+ }
+
return DescSize;
}
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f7279b664ed27..52ee1e874ad86 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -64,6 +64,13 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}
+class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
+ : VOP3P_Mix_Profile<P, Features, 0> {
+ let IsTrue16 = 1;
+ let IsRealTrue16 = 1;
+ let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
+}
+
multiclass VOP3PInst<string OpName, VOPProfile P,
SDPatternOperator node = null_frag, bit IsDOT = 0> {
def NAME : VOP3P_Pseudo<OpName, P,
@@ -95,6 +102,16 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
} // end SubtargetPredicate = isGFX11Plus
}
+multiclass VOP3_VOP3PInst_t16<string OpName, VOP3P_Mix_Profile P> {
+ def NAME : VOP3P_Pseudo<OpName, P>;
+
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName#"_dpp";
+ }
+}
+
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
// TODO: Make sure we're doing the right thing with denormals. Note
// that FMA and MAD will differ.
-multiclass MadFmaMixPats<SDPatternOperator fma_like,
- Instruction mix_inst,
- Instruction mixlo_inst,
- Instruction mixhi_inst,
- ValueType VT = f16,
- ValueType vecVT = v2f16> {
+multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
+ Instruction mix_inst,
+ ValueType VT = f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// At least one of the operands needs to be an fpextend of an f16
@@ -189,7 +203,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
(f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
+}
+multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
+ Instruction mixlo_inst,
+ Instruction mixhi_inst,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
def : GCNPat <
(AMDGPUclamp (build_vector
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
@@ -243,9 +264,6 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
// FIXME: Special case handling for maxhi (especially for clamp)
// because dealing with the write to high half of the register is
// difficult.
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
- let True16Predicate = p in {
-
def : GCNPat <
(build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
@@ -269,45 +287,60 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
DSTCLAMP.ENABLE,
VGPR_32:$elt0))
>;
+}
- } // end True16Predicate
+multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
+ Instruction mix_inst_16,
+ ValueType VT = f16,
+ ValueType vecVT = v2f16> {
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
+ def : GCNPat <
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ (i32 0), (i32 0),
+ DSTCLAMP.NONE)
+ >;
- let True16Predicate = UseRealTrue16Insts in {
def : GCNPat <
- (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
- (vecVT (mixlo_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.NONE)
>;
+
def : GCNPat <
- (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
- (vecVT (mixhi_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.NONE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
+ (mix_inst_16 $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $src2_modifiers, $src2,
+ DSTCLAMP.ENABLE)
>;
def : GCNPat <
- (build_vector
- VT:$elt0,
- (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
- (vecVT (mixhi_inst $src0_modifiers, $src0,
- $src1_modifiers, $src1,
- $src2_modifiers, $src2,
- DSTCLAMP.ENABLE,
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
+ (AMDGPUclamp (build_vector
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
+ (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0,
+ $lo_src1_modifiers, $lo_src1,
+ $lo_src2_modifiers, $lo_src2,
+ DSTCLAMP.ENABLE), lo16,
+ (mix_inst_16 $hi_src0_modifiers, $hi_src0,
+ $hi_src1_modifiers, $hi_src1,
+ $hi_src2_modifiers, $hi_src2,
+ DSTCLAMP.ENABLE), hi16))
>;
- } // end True16Predicate
}
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -341,7 +374,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+defm : MadFmaMixFP32Pats<fmad, V_MAD_MIX_F32>;
+defm : MadFmaMixFP16Pats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // OtherPredicates = [NoFP32Denormals]
} // End SubtargetPredicate = HasMadMixInsts
@@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F
let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
}
+
+// Pseudo true16 inst for v_fma_mixlo/hi_f16
+defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32>;
+
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
+defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+let True16Predicate = UseRealTrue16Insts in
+defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_F16_t16>;
}
let SubtargetPredicate = HasFmaMixBF16Insts in {
@@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP
let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
}
+
+// Pseudo true16 inst for v_fma_mixlo/hi_bf16
+defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
} // End FPDPRounding = 1
} // End isCommutable = 1
-defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32_BF16, bf16>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
+defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
+let True16Predicate = UseRealTrue16Insts in
+defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16>;
} // End SubtargetPredicate = HasFmaMixBF16Insts
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 210e09fd9169a..7f6a920d25016 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1]
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
-; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: v_fdiv_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll
new file mode 100644
index 0000000000000..1ba13b287be46
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16
+
+; Make sure no "vgpr32 = copy vgpr16" is generated
+
+define amdgpu_kernel void @fma_mix_f16 (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) {
+ ; GFX11-REAL16-LABEL: name: fma_mix_f16
+ ; GFX11-REAL16: bb.0.entry:
+ ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: {{ $}}
+ ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4)
+ ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1
+ ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0
+ ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1
+ ; GFX11-REAL16-NEXT: [[CO...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/159648
More information about the llvm-commits
mailing list