[llvm] [AMDGPU][True16][MC] VOP2 update instructions with fake16 format (PR #114436)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 31 10:37:31 PDT 2024
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/114436
Some old "t16" VOP2 instructions are actually in fake16 format. Rename them to "fake16" and update the affected test files accordingly.
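For context: fake16 instructions still operate on 32-bit VGPRs (restricted to VGPR_32_Lo128), whereas real true16 instructions use 16-bit VGPR halves (VGPR_16). A minimal C++ sketch of the opcode choice this patch settles on; the helper pickFMAAKOpcode is hypothetical and added only for illustration, since in-tree the logic is inlined in SIInstrInfo::foldImmediate:

  // Hypothetical helper, for illustration only; the in-tree logic is
  // inlined in SIInstrInfo.cpp rather than factored out like this.
  static unsigned pickFMAAKOpcode(const GCNSubtarget &ST, bool IsF32) {
    if (IsF32)
      return AMDGPU::V_FMAAK_F32;
    // Until a true16 V_FMAAK_F16_t16 pseudo is wired up, subtargets with
    // true16 support still select the fake16 variant, which takes
    // VGPR_32_Lo128 operands.
    return ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
                                  : AMDGPU::V_FMAAK_F16;
  }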
From 178d1d1a05d954590c29ea1b2c68f25552288e34 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 31 Oct 2024 13:35:20 -0400
Subject: [PATCH] [AMDGPU][True16][MC] VOP2 update instructions with fake16
format
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 41 ++++---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 15 +++
llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +-
.../Target/AMDGPU/SIShrinkInstructions.cpp | 4 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +-
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 115 +++++++++++-------
.../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 2 +-
.../test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir | 8 +-
llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir | 8 +-
10 files changed, 125 insertions(+), 80 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index f0c7837e0bb75a..0b8be0d88170c4 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -176,7 +176,7 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_FMAC_F16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
- case AMDGPU::V_FMAC_F16_t16_e64:
+ case AMDGPU::V_FMAC_F16_fake16_e64:
return AMDGPU::V_FMA_F16_gfx9_e64;
case AMDGPU::V_FMAC_LEGACY_F32_e64:
return AMDGPU::V_FMA_LEGACY_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 89a2eb4f18946b..561816618f68e7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3480,7 +3480,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64) {
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -3500,7 +3500,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
bool IsFMA =
Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64;
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -3533,16 +3533,16 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
+ : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
+ if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
return false;
const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
@@ -3557,8 +3557,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3612,24 +3612,24 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
- : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
+ : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
// would also require restricting their register classes. For now
// just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
+ if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_e64)
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -3852,19 +3852,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return MIB;
}
+// FIXME-TRUE16. assert should be enabled after V_FMAC_F16_t16 is enabled
+#if 0
assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
"V_FMAC_F16_t16_e32 is not supported and not expected to be present "
"pre-RA");
+#endif
// Handle MAC/FMAC.
bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64;
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64;
bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
@@ -3878,7 +3881,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
return nullptr;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
- case AMDGPU::V_FMAC_F16_t16_e64:
+ case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
@@ -3963,7 +3966,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
int64_t Imm;
if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16)
: AMDGPU::V_FMAAK_F32)
: (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
@@ -3982,7 +3985,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
unsigned NewOpc =
- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16)
: AMDGPU::V_FMAMK_F32)
: (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
@@ -4437,7 +4440,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_LEGACY_F32_e64:
case AMDGPU::V_FMAC_F16_e64:
- case AMDGPU::V_FMAC_F16_t16_e64:
+ case AMDGPU::V_FMAC_F16_fake16_e64:
case AMDGPU::V_FMAC_F32_e64:
case AMDGPU::V_FMAC_F64_e64:
case AMDGPU::V_FMAC_LEGACY_F32_e64:
@@ -5484,7 +5487,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
+ case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 42a1ffb8a26d4a..0d65a1ecd5bc8e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1710,6 +1710,21 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
1 : VSrc_b32);
}
+// Returns the vreg register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+ RegisterOperand ret =
+ !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
+ !eq(VT.Size, 96) : RegisterOperand<VReg_96>,
+ !eq(VT.Size, 64) : RegisterOperand<VReg_64>,
+ !eq(VT.Size, 48) : RegisterOperand<VReg_64>,
+ !eq(VT.Size, 16) : !if(IsTrue16,
+ !if(IsFake16, RegisterOperand<VGPR_32>,
+ RegisterOperand<VGPR_16>),
+ RegisterOperand<VGPR_32>),
+ 1 : RegisterOperand<VGPR_32>);
+}
+
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c8a46217190a1d..c4977f1fb2aece 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3200,7 +3200,7 @@ def : GCNPat <
let SubtargetPredicate = isGFX10Plus in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
-let OtherPredicates = [NotHasTrue16BitInsts] in
+let True16Predicate = NotHasTrue16BitInsts in
def : GCNPat <
(fma (f16 (VOP3NoMods f32:$src0)),
(f16 (VOP3NoMods f32:$src1)),
@@ -3208,12 +3208,12 @@ def : GCNPat <
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
-let OtherPredicates = [HasTrue16BitInsts] in
+let True16Predicate = UseFakeTrue16Insts in
def : GCNPat <
(fma (f16 (VOP3NoMods f32:$src0)),
(f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
- (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
+ (V_FMAC_F16_fake16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index f0b0e378ad668d..42df4576a774d5 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -455,7 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
+ NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
: AMDGPU::V_FMAAK_F16;
break;
}
@@ -484,7 +484,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
break;
case AMDGPU::V_FMA_F16_e64:
case AMDGPU::V_FMA_F16_gfx9_e64:
- NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
+ NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
: AMDGPU::V_FMAMK_F16;
break;
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 20a81a3135f0b2..e3d7786cbe6b9b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -563,8 +563,8 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_e64_gfx10 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 ||
- Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 ||
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx11 ||
+ Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx12 ||
Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi ||
Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi ||
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index fbde3bb7d14111..e360c91e1664ad 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -95,6 +95,7 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let True16Predicate = ps.True16Predicate;
let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
@@ -373,10 +374,10 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
}
def VOP_MADAK_F16 : VOP_MADAK <f16>;
-def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
+def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
- let DstRC = VOPDstOperand<VGPR_32_Lo128>;
- let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
+ let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
+ let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F32 : VOP_MADAK <f32>;
@@ -398,10 +399,10 @@ class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
}
def VOP_MADMK_F16 : VOP_MADMK <f16>;
-def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
+def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
- let DstRC = VOPDstOperand<VGPR_32_Lo128>;
- let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
+ let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
+ let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1);
}
def VOP_MADMK_F32 : VOP_MADMK <f32>;
@@ -409,7 +410,9 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>;
// and processing time but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
- let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret, 3,
+ // Src2 must accept the same operand types as vdst, namely VGPRs only
+ let Src2RC64 = getVOP3VRegSrcForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
+ let Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
@@ -464,21 +467,18 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
}
def VOP_MAC_F16 : VOP_MAC <f16>;
-def VOP_MAC_F16_t16 : VOP_MAC <f16> {
+def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
let IsTrue16 = 1;
- let HasOpSel = 1;
- let AsmVOP3OpSel = getAsmVOP3OpSel<2/*NumSrcArgs*/, HasClamp, HasOMod,
- HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret;
- let DstRC = VOPDstOperand<VGPR_32_Lo128>;
- let DstRC64 = VOPDstOperand<VGPR_32>;
- let Src1RC32 = VGPRSrc_32_Lo128;
+ let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
+ let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2);
let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
- let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
- let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
- let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+ let Src0ModDPP = getSrcModDPP_t16<Src0VT, 1/*IsFake16*/>.ret;
+ let Src1ModDPP = getSrcModDPP_t16<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2ModDPP = getSrcModDPP_t16<Src2VT, 1/*IsFake16*/>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
@@ -488,10 +488,18 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
dpp8:$dpp8, Dpp8FI:$fi);
- let Src2Mod = FP32InputMods; // dummy unused modifiers
- let Src2RC64 = VGPRSrc_32; // stub argument
+ let DstRC64 = getVALUDstForVT<DstVT>.ret;
+ let Src0VOP3DPP = VGPRSrc_32;
+ let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 1/*IsFake16*/>.ret;
+ let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 1/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 1/*IsFake16*/>.ret;
+ let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 1/*IsFake16*/>.ret;
+ let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1Mod = getSrcMod<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2Mod = getSrcMod<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
}
+
def VOP_MAC_F32 : VOP_MAC <f32>;
let HasExtDPP = 0, HasExt32BitDPP = 0 in
def VOP_MAC_LEGACY_F32 : VOP_MAC <f32>;
@@ -650,15 +658,18 @@ class VOP2e_SGPR<list<ValueType> ArgVT> : VOPProfile<ArgVT> {
}
def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>;
+def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
+// V_CNDMASK_B16 is VOP3 only
def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> {
let IsTrue16 = 1;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
- let Src0Mod = getSrcMod<f16>.ret;
- let Src1Mod = getSrcMod<f16>.ret;
+ let Src0Mod = getSrc0Mod<f16, DstVT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1Mod = getSrcMod<f16, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0VOP3DPP = VGPRSrc_32;
- let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
+ let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 1/*IsFake16*/>.ret;
+ let Src0ModVOP3DPP = getSrc0ModVOP3DPP<f16, DstVT, 1/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3DPP<f16, 1/*IsFake16*/>.ret;
}
@@ -924,7 +935,6 @@ let FPDPRounding = 1 in {
let SubtargetPredicate = UseFakeTrue16Insts in
defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">;
} // End FPDPRounding = 1
-// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
defm V_LSHRREV_B16 : VOP2Inst_e64_t16 <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>;
defm V_ASHRREV_I16 : VOP2Inst_e64_t16 <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>;
@@ -988,16 +998,17 @@ let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
}
-let SubtargetPredicate = HasTrue16BitInsts in {
-def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">;
+let True16Predicate = UseFakeTrue16Insts in {
+ def V_FMAMK_F16_fake16 : VOP2_Pseudo <"v_fmamk_f16_fake16", VOP_MADMK_F16_fake16, [], "">;
}
+
let isCommutable = 1 in {
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
}
-let SubtargetPredicate = HasTrue16BitInsts in {
-def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">;
+let True16Predicate = UseFakeTrue16Insts in {
+ def V_FMAAK_F16_fake16 : VOP2_Pseudo <"v_fmaak_f16_fake16", VOP_MADAK_F16_fake16, [], "">;
}
} // End isCommutable = 1
} // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1
@@ -1006,12 +1017,14 @@ let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
isCommutable = 1 in {
-let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
-defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
+let SubtargetPredicate = isGFX10Plus in {
+let True16Predicate = NotHasTrue16BitInsts in {
+ defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
}
-let SubtargetPredicate = HasTrue16BitInsts in {
-defm V_FMAC_F16_t16 : VOP2Inst <"v_fmac_f16_t16", VOP_MAC_F16_t16>;
+let True16Predicate = UseFakeTrue16Insts in {
+ defm V_FMAC_F16_fake16 : VOP2Inst <"v_fmac_f16_fake16", VOP_MAC_F16_fake16>;
}
+} // End SubtargetPredicate = isGFX10Plus
} // End FMAC Constraints
let SubtargetPredicate = Has16BitInsts in {
@@ -1576,14 +1589,20 @@ multiclass VOP2_Real_FULL_with_name_gfx12<bits<6> op, string opName,
string asmName> :
VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
-multiclass VOP2_Real_FULL_t16_with_name_gfx12<bits<6> op, string opName,
- string asmName, string alias> {
+multiclass VOP2_Real_FULL_t16_gfx12<bits<6> op, string opName,
+ string asmName, string alias> {
defm NAME : VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
def _gfx12_2nd_alias : AMDGPUMnemonicAlias<alias, asmName> {
let AssemblerPredicate = isGFX12Only;
}
}
+multiclass VOP2_Real_FULL_t16_and_fake16_gfx12<bits<6> op, string opName,
+ string asmName, string alias> {
+ defm _t16: VOP2_Real_FULL_t16_gfx12<op, opName#"_t16", asmName, alias>;
+ defm _fake16: VOP2_Real_FULL_t16_gfx12<op, opName#"_fake16", asmName, alias>;
+}
+
multiclass VOP2_Real_NO_DPP_with_name_gfx12<bits<6> op, string opName,
string asmName> :
VOP2_Real_NO_DPP_with_name<GFX12Gen, op, opName, asmName>;
@@ -1607,10 +1626,8 @@ defm V_SUBREV_CO_CI_U32 :
defm V_MIN_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x015, "V_MIN_F32", "v_min_num_f32">;
defm V_MAX_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x016, "V_MAX_F32", "v_max_num_f32">;
-defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_t16", "v_min_num_f16", "v_min_f16">;
-defm V_MIN_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_fake16", "v_min_num_f16", "v_min_f16">;
-defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_t16", "v_max_num_f16", "v_max_f16">;
-defm V_MAX_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_fake16", "v_max_num_f16", "v_max_f16">;
+defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x030, "V_MIN_F16", "v_min_num_f16", "v_min_f16">;
+defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x031, "V_MAX_F16", "v_max_num_f16", "v_max_f16">;
let SubtargetPredicate = isGFX12Plus in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx12>;
@@ -1645,6 +1662,14 @@ multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
}
}
+multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName, string opName = NAME> :
+ VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_t16_and_fake16_gfx11<bits<6> op, string asmName, string opName = NAME> {
+ defm opName#"_t16": VOP2_Real_FULL_t16_gfx11<op, asmName, opName#"_t16">;
+ defm opName#"_fake16": VOP2_Real_FULL_t16_gfx11<op, asmName, opName#"_fake16">;
+}
+
multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
string asmName> :
VOP2_Real_NO_DPP_with_name<GFX11Gen, op, opName, asmName>;
@@ -1675,14 +1700,16 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string as
multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> :
VOP3beOnly_Realtriple<GFX11Gen, op>, VOP3beOnly_Realtriple<GFX12Gen, op>;
-multiclass VOP2Only_Real_MADK_with_name_gfx11_gfx12<bits<6> op, string asmName,
+multiclass VOP2Only_Real_MADK_t16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
VOP2Only_Real_MADK_with_name<GFX11Gen, op, asmName, opName>,
VOP2Only_Real_MADK_with_name<GFX12Gen, op, asmName, opName>;
-multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName,
- string opName = NAME> :
- VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>;
+multiclass VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmName,
+ string opName = NAME> {
+ defm _t16: VOP2Only_Real_MADK_t16_gfx11_gfx12<op, asmName, opName#"_t16">;
+ defm _fake16: VOP2Only_Real_MADK_t16_gfx11_gfx12<op, asmName, opName#"_fake16">;
+}
multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
@@ -1721,15 +1748,15 @@ defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16
defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
-defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
+defm V_FMAC_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
-defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x037, "v_fmamk_f16">;
-defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x038, "v_fmaak_f16">;
+defm V_FMAMK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x037, "v_fmamk_f16">;
+defm V_FMAAK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x038, "v_fmaak_f16">;
// VOP3 only.
defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index e8291f7ab8f729..ac7944f25fe37c 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -12,7 +12,7 @@ body: |
; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec
%0:vgpr_32 = IMPLICIT_DEF
%1:sreg_32 = IMPLICIT_DEF
%2:sreg_32 = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
index 820b8579bd0a49..cefd24032871f4 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir
@@ -23,7 +23,7 @@ body: |
%1 = COPY %0.sub1
%2 = COPY %0.sub0
%3 = V_MOV_B32_e32 1078523331, implicit $exec
- %4 = V_FMAC_F16_t16_e64 0, killed %2, 0, %3, 0, killed %1, 0, 0, 0, implicit $mode, implicit $exec
+ %4 = V_FMAC_F16_fake16_e64 0, killed %2, 0, %3, 0, killed %1, 0, 0, implicit $mode, implicit $exec
...
@@ -48,7 +48,7 @@ body: |
%1 = COPY %0.sub1
%2 = COPY %0.sub0
%3 = V_MOV_B32_e32 1078523331, implicit $exec
- %4 = V_FMAC_F16_t16_e64 0, %2, 0, killed %3, 0, killed %1, 0, 0, 0, implicit $mode, implicit $exec
+ %4 = V_FMAC_F16_fake16_e64 0, %2, 0, killed %3, 0, killed %1, 0, 0, implicit $mode, implicit $exec
...
@@ -73,7 +73,7 @@ body: |
%1 = COPY %0.sub0
%2 = COPY %0.sub1
%3 = V_MOV_B32_e32 1078523331, implicit $exec
- %4 = V_FMAC_F16_t16_e64 0, killed %1, 0, %2, 0, %3, 0, 0, 0, implicit $mode, implicit $exec
+ %4 = V_FMAC_F16_fake16_e64 0, killed %1, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
...
---
@@ -95,7 +95,7 @@ body: |
%0:vgpr_32 = COPY killed $vgpr0
%1:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec
- %2:vgpr_32 = V_FMAC_F16_t16_e64 0, 16384, 0, killed %0, 0, %1, 0, 0, 0, implicit $mode, implicit $exec
+ %2:vgpr_32 = V_FMAC_F16_fake16_e64 0, 16384, 0, killed %0, 0, %1, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
index ed2148ab5a1989..26feb8120c7510 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir
@@ -267,7 +267,7 @@ body: |
; GFX11-LABEL: name: fma_cvv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_t16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
@@ -288,7 +288,7 @@ body: |
; GFX11-LABEL: name: fma_vcv_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_t16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
@@ -309,7 +309,7 @@ body: |
; GFX11-LABEL: name: fma_vvc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_t16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$vgpr1 = IMPLICIT_DEF
@@ -330,7 +330,7 @@ body: |
; GFX11-LABEL: name: fma_vsc_f16
; GFX11: $vgpr0 = IMPLICIT_DEF
; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF
- ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_t16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
+ ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec
; GFX11-NEXT: SI_RETURN implicit $vgpr2
$vgpr0 = IMPLICIT_DEF
$sgpr1 = IMPLICIT_DEF