[llvm] MCExpr-ify SIProgramInfo (PR #88257)
Janek van Oirschot via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 16 08:03:43 PDT 2024
https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/88257
>From 5da69c76c040df8c1ecaa7038d4e2ebebb93698b Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 9 Apr 2024 22:08:47 +0100
Subject: [PATCH 1/3] MCExpr-ify SIProgramInfo struct with accompanying
population function.
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 396 ++++++++++++------
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 2 +
.../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 22 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 +-
.../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 221 ++++++++++
.../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 35 +-
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 207 +++++++--
llvm/lib/Target/AMDGPU/SIProgramInfo.h | 45 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 +
llvm/test/MC/AMDGPU/alignto_mcexpr.s | 15 +
llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s | 41 ++
llvm/test/MC/AMDGPU/occupancy_mcexpr.s | 61 +++
llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s | 25 ++
14 files changed, 905 insertions(+), 191 deletions(-)
create mode 100644 llvm/test/MC/AMDGPU/alignto_mcexpr.s
create mode 100644 llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
create mode 100644 llvm/test/MC/AMDGPU/occupancy_mcexpr.s
create mode 100644 llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 052b231d62a3eb..b410f0c13e1b49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -22,6 +22,7 @@
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
@@ -134,6 +135,12 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
+uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value) {
+ int64_t Val;
+ Value->evaluateAsAbsolute(Val);
+ return Val;
+}
+
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Init target streamer if it has not yet happened
if (!IsTargetStreamerInitialized)
@@ -237,12 +244,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
- CurrentProgramInfo.NumVGPRsForWavesPerEU,
- CurrentProgramInfo.NumSGPRsForWavesPerEU -
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU),
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU) -
IsaInfo::getNumExtraSGPRs(
- &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ &STM, getMCExprValue(CurrentProgramInfo.VCCUsed),
+ getMCExprValue(CurrentProgramInfo.FlatUsed),
getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
+ getMCExprValue(CurrentProgramInfo.VCCUsed),
+ getMCExprValue(CurrentProgramInfo.FlatUsed));
Streamer.popSection();
}
@@ -422,7 +431,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (CurrentProgramInfo.DynamicCallStack &&
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack) &&
CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
@@ -439,29 +448,22 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
MCKernelDescriptor KernelDescriptor;
- assert(isUInt<32>(PI.ScratchSize));
- assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
- assert(isUInt<32>(PI.getComputePGMRSrc2()));
-
KernelDescriptor.group_segment_fixed_size =
MCConstantExpr::create(PI.LDSSize, Ctx);
- KernelDescriptor.private_segment_fixed_size =
- MCConstantExpr::create(PI.ScratchSize, Ctx);
+ KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
Align MaxKernArgAlign;
KernelDescriptor.kernarg_size = MCConstantExpr::create(
STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
- KernelDescriptor.compute_pgm_rsrc1 =
- MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx);
- KernelDescriptor.compute_pgm_rsrc2 =
- MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx);
+ KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
+ KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
KernelDescriptor.kernel_code_properties =
MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
- assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
- KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
- STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
+ assert(STM.hasGFX90AInsts() ||
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
+ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -477,7 +479,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
initTargetStreamer(*MF.getFunction().getParent());
ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
- CurrentProgramInfo = SIProgramInfo();
+ CurrentProgramInfo.reset(MF);
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
@@ -550,11 +552,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- CurrentProgramInfo.NumArchVGPR,
- STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR),
+ STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR)
: std::optional<uint32_t>(),
- CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+ getMCExprValue(CurrentProgramInfo.NumVGPR),
+ getMCExprValue(CurrentProgramInfo.NumSGPR),
+ getMCExprValue(CurrentProgramInfo.ScratchSize), getFunctionCodeSize(MF),
+ MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -565,32 +569,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ " SGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks)),
+ false);
OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+ " VGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks)),
+ false);
OutStreamer->emitRawComment(
- " NumSGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ " NumSGPRsForWavesPerEU: " +
+ Twine(getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU)),
+ false);
OutStreamer->emitRawComment(
- " NumVGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+ " NumVGPRsForWavesPerEU: " +
+ Twine(getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU)),
+ false);
if (STM.hasGFX90AInsts())
OutStreamer->emitRawComment(
- " AccumOffset: " +
- Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+ " AccumOffset: " +
+ Twine((getMCExprValue(CurrentProgramInfo.AccumOffset) + 1) * 4),
+ false);
OutStreamer->emitRawComment(
- " Occupancy: " +
- Twine(CurrentProgramInfo.Occupancy), false);
+ " Occupancy: " + Twine(getMCExprValue(CurrentProgramInfo.Occupancy)),
+ false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(CurrentProgramInfo.ScratchEnable),
- false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+ Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable)),
+ false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
false);
@@ -611,18 +621,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
assert(STM.hasGFX90AInsts() ||
- CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
- Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
- false);
+ " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+ Twine((AMDHSA_BITS_GET(
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ false);
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
- Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
- false);
+ " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+ Twine((AMDHSA_BITS_GET(
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ false);
}
}
@@ -702,23 +714,40 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
+
+ auto CreateExpr = [&Ctx](int64_t Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
- ProgInfo.NumArchVGPR = Info.NumVGPR;
- ProgInfo.NumAccVGPR = Info.NumAGPR;
- ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
- ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ auto TryGetMCExprValue = [&Ctx](const MCExpr *Value, uint64_t &Res) -> bool {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val)) {
+ Res = Val;
+ return true;
+ } else
+ return false;
+ };
+
+ ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
+ ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
+ ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
+ ProgInfo.AccumOffset =
+ CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
- ProgInfo.NumSGPR = Info.NumExplicitSGPR;
- ProgInfo.ScratchSize = Info.PrivateSegmentSize;
- ProgInfo.VCCUsed = Info.UsesVCC;
- ProgInfo.FlatUsed = Info.UsesFlatScratch;
- ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+ ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
+ ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
+ ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
+ ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+ ProgInfo.DynamicCallStack =
+ CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
- if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
- DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
- ProgInfo.ScratchSize,
+ uint64_t ScratchSize;
+ if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
+ ScratchSize > MaxScratchPerWorkitem) {
+ DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
MaxScratchPerWorkitem, DS_Error);
MF.getFunction().getContext().diagnose(DiagStackSize);
}
@@ -728,27 +757,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
- unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
- getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+ const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+ ProgInfo.VCCUsed, ProgInfo.FlatUsed, getIsaVersion(STM.getCPU()).Major,
+ STM.getFeatureBits().test(AMDGPU::FeatureArchitectedFlatScratch),
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
- if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ uint64_t NumSgpr;
+ if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(
- MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
+ MF.getFunction(), "addressable scalar registers", NumSgpr,
MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
+ ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
}
}
// Account for extra SGPRs and VGPRs reserved for debugger use.
- ProgInfo.NumSGPR += ExtraSGPRs;
+ ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
const Function &F = MF.getFunction();
@@ -819,40 +851,50 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
}
- ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
- ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
- ProgInfo.NumVGPR =
- Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
+ ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+
+ ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
+
+ ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ STM.hasGFX90AInsts(), ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
- ProgInfo.NumSGPRsForWavesPerEU = std::max(
- std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
- ProgInfo.NumVGPRsForWavesPerEU = std::max(
- std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+ ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumSGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()))},
+ Ctx);
+ ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumVGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()))},
+ Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
- if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ uint64_t NumSgpr;
+ if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm to use
// the registers which are usually reserved for vcc etc.
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
- ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
+ NumSgpr, MaxAddressableNumSGPRs,
DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
- ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
+ ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
+ ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
}
}
if (STM.hasSGPRInitBug()) {
ProgInfo.NumSGPR =
- AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
ProgInfo.NumSGPRsForWavesPerEU =
- AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
}
if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
@@ -871,11 +913,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.getAddressableLocalMemorySize(), DS_Error);
Ctx.diagnose(Diag);
}
+ // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+ // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+ auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
+ unsigned Granule) {
+ const MCExpr *OneConst = CreateExpr(1ul);
+ const MCExpr *GranuleConst = CreateExpr(Granule);
+ const MCExpr *MaxNumGPR =
+ AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *AlignToGPR =
+ AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ const MCExpr *DivGPR =
+ MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+ const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+ return SubGPR;
+ };
- ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
- &STM, ProgInfo.NumSGPRsForWavesPerEU);
- ProgInfo.VGPRBlocks =
- IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
+ ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
+ IsaInfo::getSGPREncodingGranule(&STM));
+ ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
+ IsaInfo::getVGPREncodingGranule(&STM));
const SIModeRegisterDefaults Mode = MFI->getMode();
@@ -904,14 +961,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
+ // The MCExpr equivalent of divideCeil.
+ auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
+ const MCExpr *Ceil =
+ AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+ return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
+ };
+
// Scratch is allocated in 64-dword or 256-dword blocks.
unsigned ScratchAlignShift =
STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- ProgInfo.ScratchBlocks = divideCeil(
- ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
+ ProgInfo.ScratchBlocks = DivideCeil(
+ MCBinaryExpr::createMul(ProgInfo.ScratchSize,
+ CreateExpr(STM.getWavefrontSize()), Ctx),
+ CreateExpr(1ULL << ScratchAlignShift));
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -930,8 +996,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// anything to disable it if we know the stack isn't used here. We may still
// have emitted code reading it to initialize scratch, but if that's unused
// reading garbage should be OK.
- ProgInfo.ScratchEnable =
- ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
+ ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
+ MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
+ MCConstantExpr::create(0, Ctx), Ctx),
+ ProgInfo.DynamicCallStack, Ctx);
+
ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
ProgInfo.TrapHandlerEnable =
@@ -947,26 +1016,41 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.EXCPEnable = 0;
if (STM.hasGFX90AInsts()) {
- AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
- ProgInfo.AccumOffset);
- AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
- ProgInfo.TgSplit);
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ auto Shft = MCConstantExpr::create(Shift, Ctx);
+ auto Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(
+ Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
+ return Dst;
+ };
+
+ ProgInfo.ComputePGMRSrc3GFX90A =
+ SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
+ ProgInfo.ComputePGMRSrc3GFX90A =
+ SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
- ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
- ProgInfo.NumSGPRsForWavesPerEU,
- ProgInfo.NumVGPRsForWavesPerEU);
+ ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+ STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
- if (ProgInfo.Occupancy < MinWEU) {
+ uint64_t Occupancy;
+ if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
DiagnosticInfoOptimizationFailure Diag(
F, F.getSubprogram(),
"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
"'" +
F.getName() + "': desired occupancy was " + Twine(MinWEU) +
- ", final occupancy is " + Twine(ProgInfo.Occupancy));
+ ", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
}
@@ -989,36 +1073,76 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
+ MCContext &Ctx = MF.getContext();
+
+ // (((Value) & Mask) << Shift)
+ auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
+ const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
+ const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
+ return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
+ shft, Ctx);
+ };
+
+ auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val))
+ OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
+ else
+ OutStreamer->emitValue(Value, Size);
+ };
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
+ EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
+ /*Size=*/4);
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
- OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
+ EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
- OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX12
- ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
- : STM.getGeneration() == AMDGPUSubtarget::GFX11
- ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
- : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+ // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+ // appropriate generation.
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x3FFFF, /*Shift=*/12),
+ /*Size=*/4);
+ else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x7FFF, /*Shift=*/12),
+ /*Size=*/4);
+ else
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x1FFF, /*Shift=*/12),
+ /*Size=*/4);
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer->emitInt32(RsrcReg);
- OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
- S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
+
+ const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
+ SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
+ SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
+ MF.getContext());
+ EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
- OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX12
- ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
- : STM.getGeneration() == AMDGPUSubtarget::GFX11
- ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
- : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+ // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+ // appropriate generation.
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x3FFFF, /*Shift=*/12),
+ /*Size=*/4);
+ else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x7FFF, /*Shift=*/12),
+ /*Size=*/4);
+ else
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x1FFF, /*Shift=*/12),
+ /*Size=*/4);
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1072,31 +1196,35 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
auto MD = getTargetStreamer()->getPALMetadata();
MD->setEntryPoint(CC, MF.getFunction().getName());
- MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setNumUsedVgprs(CC,
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
// Only set AGPRs for supported devices
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.hasMAIInsts()) {
- MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+ MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR));
}
- MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+ MD->setNumUsedSgprs(CC,
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
} else {
- if (CurrentProgramInfo.ScratchBlocks > 0)
+ if (getMCExprValue(CurrentProgramInfo.ScratchBlocks) > 0)
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
- MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+ MD->setHwStage(CC, ".scratch_en",
+ (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable));
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
}
// ScratchSize is in bytes, 16 aligned.
- MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
+ MD->setScratchSize(
+ CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize), 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
@@ -1158,8 +1286,10 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
- MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
- MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+ MD->setFunctionNumUsedVgprs(
+ FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
+ MD->setFunctionNumUsedSgprs(
+ FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
}
// This is supposed to be log2(Size)
@@ -1193,7 +1323,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
- if (CurrentProgramInfo.DynamicCallStack)
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack))
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
AMD_HSA_BITS_SET(Out.code_properties,
@@ -1229,9 +1359,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
- Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
- Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+ Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR);
+ Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR);
+ Out.workitem_private_segment_byte_size =
+ getMCExprValue(CurrentProgramInfo.ScratchSize);
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
// kernarg_segment_alignment is specified as log of the alignment.
@@ -1324,17 +1455,20 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// printing multiple diagnostic location and diag opts.
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
- EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
- EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
+ EmitResourceUsageRemark("NumSGPR", "SGPRs",
+ getMCExprValue(CurrentProgramInfo.NumSGPR));
+ EmitResourceUsageRemark("NumVGPR", "VGPRs",
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR));
if (hasMAIInsts)
- EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
+ EmitResourceUsageRemark("NumAGPR", "AGPRs",
+ getMCExprValue(CurrentProgramInfo.NumAccVGPR));
EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
- CurrentProgramInfo.ScratchSize);
+ getMCExprValue(CurrentProgramInfo.ScratchSize));
StringRef DynamicStackStr =
- CurrentProgramInfo.DynamicCallStack ? "True" : "False";
+ getMCExprValue(CurrentProgramInfo.DynamicCallStack) ? "True" : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- CurrentProgramInfo.Occupancy);
+ getMCExprValue(CurrentProgramInfo.Occupancy));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index b8b2718d293e69..3d155905c4afeb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
void initTargetStreamer(Module &M);
+ static uint64_t getMCExprValue(const MCExpr *Value);
+
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 9e288ab50e1701..a402e6fc68b491 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
using namespace llvm;
static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -462,6 +463,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
+ auto getMCExprValue = [](const MCExpr *Value) {
+ int64_t Val;
+ Value->evaluateAsAbsolute(Val);
+ return static_cast<uint64_t>(Val);
+ };
+
auto Kern = HSAMetadataDoc->getMapNode();
Align MaxKernArgAlign;
@@ -470,10 +477,10 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern[".group_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
- Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+ Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.ScratchSize));
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
- Kern[".uses_dynamic_stack"] =
- Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
+ Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
+ static_cast<bool>(getMCExprValue(ProgramInfo.DynamicCallStack)));
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
Kern[".workgroup_processor_mode"] =
@@ -484,12 +491,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
- Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
- Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+ Kern[".sgpr_count"] =
+ Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumSGPR));
+ Kern[".vgpr_count"] =
+ Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumVGPR));
// Only add AGPR count to metadata for supported devices
if (STM.hasMAIInsts()) {
- Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+ Kern[".agpr_count"] =
+ Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumAccVGPR));
}
Kern[".max_flat_workgroup_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 38667235211471..08b4a86994cab5 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeT.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -8406,12 +8407,17 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
AGVK VK = StringSwitch<AGVK>(TokenId)
.Case("max", AGVK::AGVK_Max)
.Case("or", AGVK::AGVK_Or)
+ .Case("extrasgprs", AGVK::AGVK_ExtraSGPRs)
+ .Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs)
+ .Case("totalnumvgprs90a", AGVK::AGVK_TotalNumVGPRs90A)
+ .Case("alignto", AGVK::AGVK_AlignTo)
+ .Case("occupancy", AGVK::AGVK_Occupancy)
.Default(AGVK::AGVK_None);
if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
SmallVector<const MCExpr *, 4> Exprs;
uint64_t CommaCount = 0;
- lex(); // Eat 'max'/'or'
+ lex(); // Eat Arg ('or', 'max', 'occupancy', etc.)
lex(); // Eat '('
while (true) {
if (trySkipToken(AsmToken::RParen)) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4578c33d92dce1..25813eb30aefd5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,6 +7,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCExpr.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -16,6 +19,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::AMDGPU;
AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
ArrayRef<const MCExpr *> Args,
@@ -61,6 +65,21 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
case AGVK_Max:
OS << "max(";
break;
+ case AGVK_ExtraSGPRs:
+ OS << "extrasgprs(";
+ break;
+ case AGVK_TotalNumVGPRs:
+ OS << "totalnumvgprs(";
+ break;
+ case AGVK_TotalNumVGPRs90A:
+ OS << "totalnumvgprs90a(";
+ break;
+ case AGVK_AlignTo:
+ OS << "alignto(";
+ break;
+ case AGVK_Occupancy:
+ OS << "occupancy(";
+ break;
}
for (auto It = Args.begin(); It != Args.end(); ++It) {
(*It)->print(OS, MAI, /*InParens=*/false);
@@ -86,6 +105,158 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
std::optional<int64_t> Total;
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
+ return false;
+
+ ConstantValue = MCVal.getConstant();
+ return true;
+ };
+
+ if (Kind == AGVK_ExtraSGPRs) {
+ assert(Args.size() == 5 &&
+ "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+ uint64_t VCCUsed, FlatScrUsed, MajorVersion, XNACKUsed,
+ hasArchitectedFlatScr, ExtraSGPRs = 0;
+
+ bool Success = true;
+ Success &= TryGetMCExprValue(Args[0], MajorVersion);
+ Success &= TryGetMCExprValue(Args[3], XNACKUsed);
+ Success &= TryGetMCExprValue(Args[4], hasArchitectedFlatScr);
+
+ assert(Success &&
+ "Arguments 1, 4, and 5 for ExtraSGPRs should be known constants");
+ if (!Success || !TryGetMCExprValue(Args[1], VCCUsed) ||
+ !TryGetMCExprValue(Args[2], FlatScrUsed))
+ return false;
+
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+ if (MajorVersion >= 10) {
+ Res = MCValue::get(ExtraSGPRs);
+ return true;
+ }
+ if (MajorVersion < 8) {
+ if (FlatScrUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (XNACKUsed)
+ ExtraSGPRs = 4;
+ if (FlatScrUsed || hasArchitectedFlatScr)
+ ExtraSGPRs = 6;
+ }
+
+ Res = MCValue::get(ExtraSGPRs);
+ return true;
+ }
+
+ if (Kind == AGVK_AlignTo) {
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for AlignTo");
+ uint64_t Value, Align;
+ if (!TryGetMCExprValue(Args[0], Value) ||
+ !TryGetMCExprValue(Args[1], Align))
+ return false;
+
+ Res = MCValue::get(alignTo(Value, Align));
+ return true;
+ }
+
+ if (Kind == AGVK_TotalNumVGPRs90A) {
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs90A");
+ uint64_t NumAGPR, NumVGPR, Total;
+ if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+ !TryGetMCExprValue(Args[1], NumVGPR))
+ return false;
+
+ if (NumAGPR) {
+ Total = alignTo(NumVGPR, 4) + NumAGPR;
+ } else {
+ Total = std::max(NumVGPR, NumAGPR);
+ }
+
+ Res = MCValue::get(Total);
+ return true;
+ }
+
+ if (Kind == AGVK_TotalNumVGPRs) {
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+ uint64_t NumAGPR, NumVGPR;
+ if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+ !TryGetMCExprValue(Args[1], NumVGPR))
+ return false;
+
+ Res = MCValue::get(std::max(NumVGPR, NumAGPR));
+ return true;
+ }
+
+ if (Kind == AGVK_Occupancy) {
+ assert(Args.size() == 7 &&
+ "AMDGPUVariadic Argument count incorrect for Occupancy");
+ uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
+ NumSGPRs, NumVGPRs;
+
+ bool Success = true;
+ Success &= TryGetMCExprValue(Args[0], MaxWaves);
+ Success &= TryGetMCExprValue(Args[1], Granule);
+ Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
+ Success &= TryGetMCExprValue(Args[3], Generation);
+ Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+
+ assert(Success &&
+ "Arguments 1 to 5 for Occupancy should be known constants");
+
+ if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
+ !TryGetMCExprValue(Args[6], NumVGPRs))
+ return false;
+
+ auto OccWithNumVGPRs = [&](uint64_t NumVGPRs) -> uint64_t {
+ return IsaInfo::getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves,
+ TargetTotalNumVGPRs);
+ };
+
+ // Mirrors GCNSubtarget::getOccupancyWithNumSGPRs without dependency on
+ // subtarget.
+ auto OccWithNumSGPRs = [&](uint64_t NumSGPRs) -> uint64_t {
+ if (Generation >= AMDGPUSubtarget::GFX10)
+ return MaxWaves;
+
+ if (Generation >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (NumSGPRs <= 80)
+ return 10;
+ if (NumSGPRs <= 88)
+ return 9;
+ if (NumSGPRs <= 100)
+ return 8;
+ return 7;
+ }
+ if (NumSGPRs <= 48)
+ return 10;
+ if (NumSGPRs <= 56)
+ return 9;
+ if (NumSGPRs <= 64)
+ return 8;
+ if (NumSGPRs <= 72)
+ return 7;
+ if (NumSGPRs <= 80)
+ return 6;
+ return 5;
+ };
+
+ uint64_t Occupancy = InitOccupancy;
+ if (NumSGPRs)
+ Occupancy = std::min(Occupancy, OccWithNumSGPRs(NumSGPRs));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy, OccWithNumVGPRs(NumVGPRs));
+
+ Res = MCValue::get(Occupancy);
+ return true;
+ }
+
for (const MCExpr *Arg : Args) {
MCValue ArgRes;
if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
@@ -113,3 +284,53 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
}
return nullptr;
}
+
+/// Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed
+/// are unresolvable but needed for further MCExprs). Derived from
+/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
+///
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createExtraSGPRs(
+ const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, unsigned MajorVersion,
+ bool hasArchitectedFlatScratch, bool XNACKUsed, MCContext &Ctx) {
+ auto GetConstantExpr = [&Ctx](int64_t Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
+
+ return create(AGVK_ExtraSGPRs,
+ {GetConstantExpr(MajorVersion), VCCUsed, FlatScrUsed,
+ GetConstantExpr(XNACKUsed),
+ GetConstantExpr(hasArchitectedFlatScratch)},
+ Ctx);
+}
+
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ bool has90AInsts, const MCExpr *NumAGPR, const MCExpr *NumVGPR,
+ MCContext &Ctx) {
+ return create(has90AInsts ? AGVK_TotalNumVGPRs90A : AGVK_TotalNumVGPRs,
+ {NumAGPR, NumVGPR}, Ctx);
+}
+
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only only the necessary values
+/// for said occupancy computation. Should match computeOccupancy implementation
+/// without passing \p STM on.
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM, MCContext &Ctx) {
+ unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+ unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+ unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+ unsigned Generation = STM.getGeneration();
+
+ auto CreateExpr = [&Ctx](unsigned Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
+
+ return create(AGVK_Occupancy,
+ {CreateExpr(MaxWaves), CreateExpr(Granule),
+ CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
+ CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
+ Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 238e0dea791b24..f317ef73fa3e24 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -14,6 +14,9 @@
namespace llvm {
+class Function;
+class GCNSubtarget;
+
/// AMDGPU target specific variadic MCExpr operations.
///
/// Takes in a minimum of 1 argument to be used with an operation. The supported
@@ -26,7 +29,16 @@ namespace llvm {
///
class AMDGPUVariadicMCExpr : public MCTargetExpr {
public:
- enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+ enum VariadicKind {
+ AGVK_None,
+ AGVK_Or,
+ AGVK_Max,
+ AGVK_ExtraSGPRs,
+ AGVK_TotalNumVGPRs,
+ AGVK_TotalNumVGPRs90A,
+ AGVK_AlignTo,
+ AGVK_Occupancy
+ };
private:
VariadicKind Kind;
@@ -52,6 +64,27 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
return create(VariadicKind::AGVK_Max, Args, Ctx);
}
+ static const AMDGPUVariadicMCExpr *
+ createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed,
+ unsigned MajorVersion, bool hasArchitectedFlatScratch,
+ bool XNACKUsed, MCContext &Ctx);
+
+ static const AMDGPUVariadicMCExpr *createTotalNumVGPR(bool has90AInsts,
+ const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx);
+
+ static const AMDGPUVariadicMCExpr *
+ createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
+ return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+ }
+
+ static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx);
+
VariadicKind getKind() const { return Kind; }
const MCExpr *getSubExpr(size_t Index) const;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 9ed7aacc0538ec..0d40816cdd4b8e 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -18,57 +18,114 @@
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
using namespace llvm;
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
- uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) |
- S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+void SIProgramInfo::reset(const MachineFunction &MF) {
+ MCContext &Ctx = MF.getContext();
+
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+
+ VGPRBlocks = ZeroExpr;
+ SGPRBlocks = ZeroExpr;
+ Priority = 0;
+ FloatMode = 0;
+ Priv = 0;
+ DX10Clamp = 0;
+ DebugMode = 0;
+ IEEEMode = 0;
+ WgpMode = 0;
+ MemOrdered = 0;
+ RrWgMode = 0;
+ ScratchSize = ZeroExpr;
+
+ LDSBlocks = 0;
+ ScratchBlocks = ZeroExpr;
+
+ ScratchEnable = ZeroExpr;
+ UserSGPR = 0;
+ TrapHandlerEnable = 0;
+ TGIdXEnable = 0;
+ TGIdYEnable = 0;
+ TGIdZEnable = 0;
+ TGSizeEnable = 0;
+ TIdIGCompCount = 0;
+ EXCPEnMSB = 0;
+ LdsSize = 0;
+ EXCPEnable = 0;
+
+ ComputePGMRSrc3GFX90A = ZeroExpr;
+
+ NumVGPR = ZeroExpr;
+ NumArchVGPR = ZeroExpr;
+ NumAccVGPR = ZeroExpr;
+ AccumOffset = ZeroExpr;
+ TgSplit = 0;
+ NumSGPR = ZeroExpr;
+ SGPRSpill = 0;
+ VGPRSpill = 0;
+ LDSSize = 0;
+ FlatUsed = ZeroExpr;
+
+ NumSGPRsForWavesPerEU = ZeroExpr;
+ NumVGPRsForWavesPerEU = ZeroExpr;
+ Occupancy = ZeroExpr;
+ DynamicCallStack = ZeroExpr;
+ VCCUsed = ZeroExpr;
+}
+
+static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+ const GCNSubtarget &ST) {
+ uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+ S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
if (ST.hasDX10ClampMode())
- Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+ Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
if (ST.hasIEEEMode())
- Reg |= S_00B848_IEEE_MODE(IEEEMode);
+ Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
if (ST.hasRrWGMode())
- Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+ Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
return Reg;
}
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
- const GCNSubtarget &ST) const {
- if (AMDGPU::isCompute(CC)) {
- return getComputePGMRSrc1(ST);
- }
- uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode);
+static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+ CallingConv::ID CC, const GCNSubtarget &ST) {
+ uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode);
if (ST.hasDX10ClampMode())
- Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+ Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
if (ST.hasIEEEMode())
- Reg |= S_00B848_IEEE_MODE(IEEEMode);
+ Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
if (ST.hasRrWGMode())
- Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+ Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
switch (CC) {
case CallingConv::AMDGPU_PS:
- Reg |= S_00B028_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_VS:
- Reg |= S_00B128_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_GS:
- Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_HS:
- Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
break;
default:
break;
@@ -76,22 +133,108 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
return Reg;
}
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
- uint64_t Reg =
- S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
- S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
- S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
- S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
- S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
- S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
- S_00B84C_EXCP_EN(EXCPEnable);
+static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
+ uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
+ S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
+ S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
+ S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
+ S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
+ S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
+ S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
+ S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
+ S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
+ S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
+
+ return Reg;
+}
+
+static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
+ MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
+ int64_t VBlocks, SBlocks;
+ VGPRBlocks->evaluateAsAbsolute(VBlocks);
+ SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+ uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+ S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
+ getComputePGMRSrc1Reg(*this, ST);
return Reg;
}
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+ const GCNSubtarget &ST) const {
+ if (AMDGPU::isCompute(CC)) {
+ return getComputePGMRSrc1(ST);
+ }
+ int64_t VBlocks, SBlocks;
+ VGPRBlocks->evaluateAsAbsolute(VBlocks);
+ SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+ return getPGMRSrc1Reg(*this, CC, ST) |
+ S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+ S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+ int64_t ScratchEn;
+ ScratchEnable->evaluateAsAbsolute(ScratchEn);
+ return ScratchEn | getComputePGMRSrc2Reg(*this);
+}
+
uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
if (AMDGPU::isCompute(CC))
return getComputePGMRSrc2();
return 0;
}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
+ MCContext &Ctx) const {
+ uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ const MCExpr *Res = MCBinaryExpr::createOr(
+ MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+ MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+ return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+ const GCNSubtarget &ST,
+ MCContext &Ctx) const {
+ if (AMDGPU::isCompute(CC)) {
+ return getComputePGMRSrc1(ST, Ctx);
+ }
+
+ uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ const MCExpr *Res = MCBinaryExpr::createOr(
+ MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+ MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+ return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
+ uint64_t Reg = getComputePGMRSrc2Reg(*this);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
+ MCContext &Ctx) const {
+ if (AMDGPU::isCompute(CC))
+ return getComputePGMRSrc2(Ctx);
+
+ return MCConstantExpr::create(0, Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 8c26789f936cff..5ff9b607c7fbcf 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -22,12 +22,15 @@
namespace llvm {
class GCNSubtarget;
+class MCContext;
+class MCExpr;
+class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks = 0;
- uint32_t SGPRBlocks = 0;
+ const MCExpr *VGPRBlocks = 0;
+ const MCExpr *SGPRBlocks = 0;
uint32_t Priority = 0;
uint32_t FloatMode = 0;
uint32_t Priv = 0;
@@ -37,14 +40,14 @@ struct SIProgramInfo {
uint32_t WgpMode = 0; // GFX10+
uint32_t MemOrdered = 0; // GFX10+
uint32_t RrWgMode = 0; // GFX12+
- uint64_t ScratchSize = 0;
+ const MCExpr *ScratchSize = nullptr;
// State used to calculate fields set in PGM_RSRC2 pm4 packet.
uint32_t LDSBlocks = 0;
- uint32_t ScratchBlocks = 0;
+ const MCExpr *ScratchBlocks = nullptr;
// Fields set in PGM_RSRC2 pm4 packet
- uint32_t ScratchEnable = 0;
+ const MCExpr *ScratchEnable = nullptr;
uint32_t UserSGPR = 0;
uint32_t TrapHandlerEnable = 0;
uint32_t TGIdXEnable = 0;
@@ -56,44 +59,52 @@ struct SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;
- uint64_t ComputePGMRSrc3GFX90A = 0;
+ const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
- uint32_t NumVGPR = 0;
- uint32_t NumArchVGPR = 0;
- uint32_t NumAccVGPR = 0;
- uint32_t AccumOffset = 0;
+ const MCExpr *NumVGPR = nullptr;
+ const MCExpr *NumArchVGPR = nullptr;
+ const MCExpr *NumAccVGPR = nullptr;
+ const MCExpr *AccumOffset = nullptr;
uint32_t TgSplit = 0;
- uint32_t NumSGPR = 0;
+ const MCExpr *NumSGPR = nullptr;
unsigned SGPRSpill = 0;
unsigned VGPRSpill = 0;
uint32_t LDSSize = 0;
- bool FlatUsed = false;
+ const MCExpr *FlatUsed = nullptr;
// Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU = 0;
+ const MCExpr *NumSGPRsForWavesPerEU = nullptr;
// Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU = 0;
+ const MCExpr *NumVGPRsForWavesPerEU = nullptr;
// Final occupancy.
- uint32_t Occupancy = 0;
+ const MCExpr *Occupancy = nullptr;
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
- bool DynamicCallStack = false;
+ const MCExpr *DynamicCallStack = nullptr;
// Bonus information for debugging.
- bool VCCUsed = false;
+ const MCExpr *VCCUsed = nullptr;
SIProgramInfo() = default;
+ void reset(const MachineFunction &MF);
+
/// Compute the value of the ComputePGMRsrc1 register.
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
+ const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
+ MCContext &Ctx) const;
+ const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
+ MCContext &Ctx) const;
/// Compute the value of the ComputePGMRsrc2 register.
uint64_t getComputePGMRSrc2() const;
uint64_t getPGMRSrc2(CallingConv::ID CC) const;
+ const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
+ const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4e0074451aa58c..82897a68082a74 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1129,12 +1129,18 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs) {
- unsigned MaxWaves = getMaxWavesPerEU(STI);
- unsigned Granule = getVGPRAllocGranule(STI);
+ return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+ getMaxWavesPerEU(STI),
+ getTotalNumVGPRs(STI));
+}
+
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+ unsigned MaxWaves,
+ unsigned TotalNumVGPRs) {
if (NumVGPRs < Granule)
return MaxWaves;
unsigned RoundedRegs = alignTo(NumVGPRs, Granule);
- return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves);
+ return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
}
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 943588fe701cc8..8b9ce1b191a298 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -311,6 +311,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
+/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
+/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+ unsigned MaxWaves,
+ unsigned TotalNumVGPRs);
+
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used. We actually return the number of blocks -1, since
/// that's what we encode.
diff --git a/llvm/test/MC/AMDGPU/alignto_mcexpr.s b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
new file mode 100644
index 00000000000000..e864f3736828c4
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set alignto_zero_eight, 0
+// ASM: .set alignto_one_eight, 8
+// ASM: .set alignto_five_eight, 8
+// ASM: .set alignto_seven_eight, 8
+// ASM: .set alignto_eight_eight, 8
+// ASM: .set alignto_ten_eight, 16
+
+.set alignto_zero_eight, alignto(0, 8)
+.set alignto_one_eight, alignto(1, 8)
+.set alignto_five_eight, alignto(5, 8)
+.set alignto_seven_eight, alignto(7, 8)
+.set alignto_eight_eight, alignto(8, 8)
+.set alignto_ten_eight, alignto(10, 8)
diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
new file mode 100644
index 00000000000000..89e4954b805471
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set extrasgpr_none_gfx7, 0
+// ASM: .set extrasgpr_none_gfx9, 0
+// ASM: .set extrasgpr_none_gfx10, 0
+
+.set extrasgpr_none_gfx7, extrasgprs(7, 0, 0, 0, 0)
+.set extrasgpr_none_gfx9, extrasgprs(9, 0, 0, 0, 0)
+.set extrasgpr_none_gfx10, extrasgprs(10, 0, 0, 0, 0)
+
+// ASM: .set extrasgpr_vcc_gfx7, 2
+// ASM: .set extrasgpr_vcc_gfx9, 2
+// ASM: .set extrasgpr_vcc_gfx10, 2
+
+.set extrasgpr_vcc_gfx7, extrasgprs(7, 1, 0, 0, 0)
+.set extrasgpr_vcc_gfx9, extrasgprs(9, 1, 0, 0, 0)
+.set extrasgpr_vcc_gfx10, extrasgprs(10, 1, 0, 0, 0)
+
+// ASM: .set extrasgpr_flatscr_gfx7, 4
+// ASM: .set extrasgpr_flatscr_gfx9, 6
+// ASM: .set extrasgpr_flatscr_gfx10, 0
+
+.set extrasgpr_flatscr_gfx7, extrasgprs(7, 0, 1, 0, 0)
+.set extrasgpr_flatscr_gfx9, extrasgprs(9, 0, 1, 0, 0)
+.set extrasgpr_flatscr_gfx10, extrasgprs(10, 0, 1, 0, 0)
+
+// ASM: .set extrasgpr_xnack_gfx7, 0
+// ASM: .set extrasgpr_xnack_gfx9, 4
+// ASM: .set extrasgpr_xnack_gfx10, 0
+
+.set extrasgpr_xnack_gfx7, extrasgprs(7, 0, 0, 1, 0)
+.set extrasgpr_xnack_gfx9, extrasgprs(9, 0, 0, 1, 0)
+.set extrasgpr_xnack_gfx10, extrasgprs(10, 0, 0, 1, 0)
+
+// ASM: .set extrasgpr_archflatscr_gfx7, 0
+// ASM: .set extrasgpr_archflatscr_gfx9, 6
+// ASM: .set extrasgpr_archflatscr_gfx10, 0
+
+.set extrasgpr_archflatscr_gfx7, extrasgprs(7, 0, 0, 0, 1)
+.set extrasgpr_archflatscr_gfx9, extrasgprs(9, 0, 0, 0, 1)
+.set extrasgpr_archflatscr_gfx10, extrasgprs(10, 0, 0, 0, 1)
diff --git a/llvm/test/MC/AMDGPU/occupancy_mcexpr.s b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
new file mode 100644
index 00000000000000..06bec8c538daea
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set occupancy_init_one, 1
+// ASM: .set occupancy_init_seven, 7
+// ASM: .set occupancy_init_eight, 8
+
+.set occupancy_init_one, occupancy(0, 0, 0, 0, 1, 0, 0)
+.set occupancy_init_seven, occupancy(0, 0, 0, 0, 7, 0, 0)
+.set occupancy_init_eight, occupancy(0, 0, 0, 0, 8, 0, 0)
+
+// ASM: .set occupancy_numsgpr_seaisle_ten, 10
+// ASM: .set occupancy_numsgpr_seaisle_nine, 9
+// ASM: .set occupancy_numsgpr_seaisle_eight, 8
+// ASM: .set occupancy_numsgpr_seaisle_seven, 7
+// ASM: .set occupancy_numsgpr_seaisle_six, 6
+// ASM: .set occupancy_numsgpr_seaisle_five, 5
+
+.set occupancy_numsgpr_seaisle_ten, occupancy(0, 0, 0, 6, 11, 1, 0)
+.set occupancy_numsgpr_seaisle_nine, occupancy(0, 0, 0, 6, 11, 49, 0)
+.set occupancy_numsgpr_seaisle_eight, occupancy(0, 0, 0, 6, 11, 57, 0)
+.set occupancy_numsgpr_seaisle_seven, occupancy(0, 0, 0, 6, 11, 65, 0)
+.set occupancy_numsgpr_seaisle_six, occupancy(0, 0, 0, 6, 11, 73, 0)
+.set occupancy_numsgpr_seaisle_five, occupancy(0, 0, 0, 6, 11, 81, 0)
+
+// ASM: .set occupancy_numsgpr_gfx9_ten, 10
+// ASM: .set occupancy_numsgpr_gfx9_nine, 9
+// ASM: .set occupancy_numsgpr_gfx9_eight, 8
+// ASM: .set occupancy_numsgpr_gfx9_seven, 7
+
+.set occupancy_numsgpr_gfx9_ten, occupancy(0, 0, 0, 8, 11, 1, 0)
+.set occupancy_numsgpr_gfx9_nine, occupancy(0, 0, 0, 8, 11, 81, 0)
+.set occupancy_numsgpr_gfx9_eight, occupancy(0, 0, 0, 8, 11, 89, 0)
+.set occupancy_numsgpr_gfx9_seven, occupancy(0, 0, 0, 8, 11, 101, 0)
+
+// ASM: .set occupancy_numsgpr_gfx10_one, 1
+// ASM: .set occupancy_numsgpr_gfx10_seven, 7
+// ASM: .set occupancy_numsgpr_gfx10_eight, 8
+
+.set occupancy_numsgpr_gfx10_one, occupancy(1, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_seven, occupancy(7, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_eight, occupancy(8, 0, 0, 9, 11, 1, 0)
+
+// ASM: .set occupancy_numvgpr_high_granule_one, 1
+// ASM: .set occupancy_numvgpr_high_granule_seven, 7
+// ASM: .set occupancy_numvgpr_high_granule_eight, 8
+
+.set occupancy_numvgpr_high_granule_one, occupancy(1, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_seven, occupancy(7, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_eight, occupancy(8, 2, 0, 0, 11, 0, 1)
+
+// ASM: .set occupancy_numvgpr_low_total_one, 1
+// ASM: .set occupancy_numvgpr_one, 1
+// ASM: .set occupancy_numvgpr_seven, 7
+// ASM: .set occupancy_numvgpr_eight, 8
+// ASM: .set occupancy_numvgpr_ten, 10
+
+.set occupancy_numvgpr_low_total_one, occupancy(11, 4, 2, 0, 11, 0, 4)
+.set occupancy_numvgpr_one, occupancy(11, 4, 4, 0, 11, 0, 4)
+.set occupancy_numvgpr_seven, occupancy(11, 4, 28, 0, 11, 0, 4)
+.set occupancy_numvgpr_eight, occupancy(11, 4, 32, 0, 11, 0, 4)
+.set occupancy_numvgpr_ten, occupancy(11, 4, 40, 0, 11, 0, 4)
diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
new file mode 100644
index 00000000000000..58f317731df849
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
@@ -0,0 +1,25 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set totalvgpr_none, 0
+// ASM: .set totalvgpr_one, 1
+// ASM: .set totalvgpr_two, 2
+
+.set totalvgpr_none, totalnumvgprs(0, 0)
+.set totalvgpr_one, totalnumvgprs(1, 0)
+.set totalvgpr_two, totalnumvgprs(1, 2)
+
+// ASM: .set totalvgpr90a_none, 0
+// ASM: .set totalvgpr90a_one, 1
+// ASM: .set totalvgpr90a_two, 2
+
+.set totalvgpr90a_none, totalnumvgprs90a(0, 0)
+.set totalvgpr90a_one, totalnumvgprs90a(0, 1)
+.set totalvgpr90a_two, totalnumvgprs90a(0, 2)
+
+// ASM: .set totalvgpr90a_agpr_minimal, 1
+// ASM: .set totalvgpr90a_agpr_rounded_eight, 8
+// ASM: .set totalvgpr90a_agpr_exact_eight, 8
+
+.set totalvgpr90a_agpr_minimal, totalnumvgprs90a(1, 0)
+.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs90a(4, 2)
+.set totalvgpr90a_agpr_exact_eight, totalnumvgprs90a(4, 4)
>From ca6f3a0dd3f28487f8f6396508a7957a1fb1fc84 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Wed, 10 Apr 2024 16:56:08 +0100
Subject: [PATCH 2/3] Remove superfluous include, set pointers to nullptr
instead 0
---
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1 -
llvm/lib/Target/AMDGPU/SIProgramInfo.h | 4 ++--
2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 08b4a86994cab5..a08c49b9990241 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "AMDKernelCodeT.h"
-#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 5ff9b607c7fbcf..047b758bb2a6cd 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -29,8 +29,8 @@ class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
- const MCExpr *VGPRBlocks = 0;
- const MCExpr *SGPRBlocks = 0;
+ const MCExpr *VGPRBlocks = nullptr;
+ const MCExpr *SGPRBlocks = nullptr;
uint32_t Priority = 0;
uint32_t FloatMode = 0;
uint32_t Priv = 0;
>From aa64a6391bbb3da3b2f653578302e1551f7648fe Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 16 Apr 2024 16:03:15 +0100
Subject: [PATCH 3/3] Feedback, remove ExtraSGPR dupicate code
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 149 +++++-----
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 2 +-
.../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 22 +-
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 25 +-
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 1 -
.../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 264 ++++++++----------
.../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 21 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 27 ++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 +
llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s | 72 ++---
llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s | 33 +--
11 files changed, 319 insertions(+), 303 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b410f0c13e1b49..496357855f34c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -135,10 +135,13 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
-uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value) {
+uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
int64_t Val;
- Value->evaluateAsAbsolute(Val);
- return Val;
+ if (!Value->evaluateAsAbsolute(Val)) {
+ Ctx.reportError(SMLoc(), "Could not resolve MCExpr when required.");
+ return 0;
+ }
+ return static_cast<uint64_t>(Val);
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
@@ -244,14 +247,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU),
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU) -
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
IsaInfo::getNumExtraSGPRs(
- &STM, getMCExprValue(CurrentProgramInfo.VCCUsed),
- getMCExprValue(CurrentProgramInfo.FlatUsed),
+ &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+ getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- getMCExprValue(CurrentProgramInfo.VCCUsed),
- getMCExprValue(CurrentProgramInfo.FlatUsed));
+ getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+ getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
Streamer.popSection();
}
@@ -431,7 +434,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack) &&
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
@@ -462,7 +465,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
@@ -482,6 +485,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
CurrentProgramInfo.reset(MF);
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ MCContext &Ctx = MF.getContext();
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
@@ -552,13 +556,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- getMCExprValue(CurrentProgramInfo.NumArchVGPR),
- STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR)
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
+ STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
: std::optional<uint32_t>(),
- getMCExprValue(CurrentProgramInfo.NumVGPR),
- getMCExprValue(CurrentProgramInfo.NumSGPR),
- getMCExprValue(CurrentProgramInfo.ScratchSize), getFunctionCodeSize(MF),
- MFI);
+ getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
+ getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
+ getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
+ getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -569,29 +573,35 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks)),
+ " SGPRBlocks: " +
+ Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
false);
OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks)),
+ " VGPRBlocks: " +
+ Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
false);
OutStreamer->emitRawComment(
" NumSGPRsForWavesPerEU: " +
- Twine(getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU)),
+ Twine(
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
false);
OutStreamer->emitRawComment(
" NumVGPRsForWavesPerEU: " +
- Twine(getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU)),
+ Twine(
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
false);
if (STM.hasGFX90AInsts())
OutStreamer->emitRawComment(
" AccumOffset: " +
- Twine((getMCExprValue(CurrentProgramInfo.AccumOffset) + 1) * 4),
+ Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
+ 4),
false);
OutStreamer->emitRawComment(
- " Occupancy: " + Twine(getMCExprValue(CurrentProgramInfo.Occupancy)),
+ " Occupancy: " +
+ Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
false);
OutStreamer->emitRawComment(
@@ -599,7 +609,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable)),
+ Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
@@ -621,18 +631,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
assert(STM.hasGFX90AInsts() ||
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
false);
OutStreamer->emitRawComment(
" COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
Twine((AMDHSA_BITS_GET(
- getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
false);
}
@@ -725,8 +735,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (Value->evaluateAsAbsolute(Val)) {
Res = Val;
return true;
- } else
- return false;
+ }
+ return false;
};
ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
@@ -758,8 +768,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
- ProgInfo.VCCUsed, ProgInfo.FlatUsed, getIsaVersion(STM.getCPU()).Major,
- STM.getFeatureBits().test(AMDGPU::FeatureArchitectedFlatScratch),
+ ProgInfo.VCCUsed, ProgInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
// Check the addressable register limit before we add ExtraSGPRs.
@@ -858,18 +867,19 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
{ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
- STM.hasGFX90AInsts(), ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+ ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
+ unsigned MaxWaves = MFI->getMaxWavesPerEU();
ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
{ProgInfo.NumSGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()))},
+ CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
Ctx);
ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
{ProgInfo.NumVGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()))},
+ CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@@ -1104,18 +1114,19 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
// appropriate generation.
- if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x3FFFF, /*Shift=*/12),
/*Size=*/4);
- else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+ } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x7FFF, /*Shift=*/12),
/*Size=*/4);
- else
+ } else {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x1FFF, /*Shift=*/12),
/*Size=*/4);
+ }
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
@@ -1131,18 +1142,19 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
// appropriate generation.
- if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x3FFFF, /*Shift=*/12),
/*Size=*/4);
- else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+ } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x7FFF, /*Shift=*/12),
/*Size=*/4);
- else
+ } else {
EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
/*Mask=*/0x1FFF, /*Shift=*/12),
/*Size=*/4);
+ }
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1194,37 +1206,38 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto CC = MF.getFunction().getCallingConv();
auto MD = getTargetStreamer()->getPALMetadata();
+ auto &Ctx = MF.getContext();
MD->setEntryPoint(CC, MF.getFunction().getName());
- MD->setNumUsedVgprs(CC,
- getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
+ MD->setNumUsedVgprs(
+ CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
// Only set AGPRs for supported devices
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.hasMAIInsts()) {
- MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR));
+ MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
}
- MD->setNumUsedSgprs(CC,
- getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
+ MD->setNumUsedSgprs(
+ CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
} else {
- if (getMCExprValue(CurrentProgramInfo.ScratchBlocks) > 0)
+ if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en",
- (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable));
+ (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
}
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(
- CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize), 16));
+ CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
@@ -1273,6 +1286,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
StringRef FnName = MF.getFunction().getName();
MD->setFunctionScratchSize(FnName, MFI.getStackSize());
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
@@ -1287,9 +1301,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
MD->setFunctionNumUsedVgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
+ FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
MD->setFunctionNumUsedSgprs(
- FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
+ FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
}
// This is supposed to be log2(Size)
@@ -1315,6 +1329,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
@@ -1323,7 +1338,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
- if (getMCExprValue(CurrentProgramInfo.DynamicCallStack))
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
AMD_HSA_BITS_SET(Out.code_properties,
@@ -1359,10 +1374,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR);
- Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR);
+ Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
+ Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
Out.workitem_private_segment_byte_size =
- getMCExprValue(CurrentProgramInfo.ScratchSize);
+ getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
// kernarg_segment_alignment is specified as log of the alignment.
@@ -1453,22 +1468,28 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
+ MCContext &MCCtx = MF.getContext();
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
EmitResourceUsageRemark("NumSGPR", "SGPRs",
- getMCExprValue(CurrentProgramInfo.NumSGPR));
- EmitResourceUsageRemark("NumVGPR", "VGPRs",
- getMCExprValue(CurrentProgramInfo.NumArchVGPR));
- if (hasMAIInsts)
- EmitResourceUsageRemark("NumAGPR", "AGPRs",
- getMCExprValue(CurrentProgramInfo.NumAccVGPR));
- EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
- getMCExprValue(CurrentProgramInfo.ScratchSize));
+ getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
+ EmitResourceUsageRemark(
+ "NumVGPR", "VGPRs",
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+ if (hasMAIInsts) {
+ EmitResourceUsageRemark(
+ "NumAGPR", "AGPRs",
+ getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+ }
+ EmitResourceUsageRemark(
+ "ScratchSize", "ScratchSize [bytes/lane]",
+ getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
StringRef DynamicStackStr =
- getMCExprValue(CurrentProgramInfo.DynamicCallStack) ? "True" : "False";
+ getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
+ : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- getMCExprValue(CurrentProgramInfo.Occupancy));
+ getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 3d155905c4afeb..16d8952a533efd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,7 +78,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
void initTargetStreamer(Module &M);
- static uint64_t getMCExprValue(const MCExpr *Value);
+ static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index a402e6fc68b491..f70ae2c2759558 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
using namespace llvm;
@@ -463,9 +464,13 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
- auto getMCExprValue = [](const MCExpr *Value) {
+ auto GetMCExprValue = [&MF](const MCExpr *Value) {
int64_t Val;
- Value->evaluateAsAbsolute(Val);
+ if (!Value->evaluateAsAbsolute(Val)) {
+ MCContext &Ctx = MF.getContext();
+ Ctx.reportError(SMLoc(), "Could not resolve MCExpr when required.");
+ Val = 0;
+ }
return static_cast<uint64_t>(Val);
};
@@ -477,10 +482,11 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern[".group_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
- Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.ScratchSize));
- if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize));
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
- static_cast<bool>(getMCExprValue(ProgramInfo.DynamicCallStack)));
+ static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack)));
+ }
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
Kern[".workgroup_processor_mode"] =
@@ -492,14 +498,14 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
Kern[".sgpr_count"] =
- Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumSGPR));
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR));
Kern[".vgpr_count"] =
- Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumVGPR));
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR));
// Only add AGPR count to metadata for supported devices
if (STM.hasMAIInsts()) {
Kern[".agpr_count"] =
- Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumAccVGPR));
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR));
}
Kern[".max_flat_workgroup_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 8f0eae362ecae0..2e68e723283c1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -664,29 +664,8 @@ bool GCNSubtarget::useVGPRIndexMode() const {
bool GCNSubtarget::useAA() const { return UseAA; }
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
- if (getGeneration() >= AMDGPUSubtarget::GFX10)
- return getMaxWavesPerEU();
-
- if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- if (SGPRs <= 80)
- return 10;
- if (SGPRs <= 88)
- return 9;
- if (SGPRs <= 100)
- return 8;
- return 7;
- }
- if (SGPRs <= 48)
- return 10;
- if (SGPRs <= 56)
- return 9;
- if (SGPRs <= 64)
- return 8;
- if (SGPRs <= 72)
- return 7;
- if (SGPRs <= 80)
- return 6;
- return 5;
+ return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
+ getGeneration());
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index a08c49b9990241..7760af27a5ae9d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8408,7 +8408,6 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
.Case("or", AGVK::AGVK_Or)
.Case("extrasgprs", AGVK::AGVK_ExtraSGPRs)
.Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs)
- .Case("totalnumvgprs90a", AGVK::AGVK_TotalNumVGPRs90A)
.Case("alignto", AGVK::AGVK_AlignTo)
.Case("occupancy", AGVK::AGVK_Occupancy)
.Default(AGVK::AGVK_None);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 25813eb30aefd5..159664faf983fa 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -71,9 +71,6 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
case AGVK_TotalNumVGPRs:
OS << "totalnumvgprs(";
break;
- case AGVK_TotalNumVGPRs90A:
- OS << "totalnumvgprs90a(";
- break;
case AGVK_AlignTo:
OS << "alignto(";
break;
@@ -101,10 +98,9 @@ static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
}
}
-bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
- MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
- std::optional<int64_t> Total;
-
+bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
MCValue MCVal;
if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
@@ -115,146 +111,136 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
return true;
};
- if (Kind == AGVK_ExtraSGPRs) {
- assert(Args.size() == 5 &&
- "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
- uint64_t VCCUsed, FlatScrUsed, MajorVersion, XNACKUsed,
- hasArchitectedFlatScr, ExtraSGPRs = 0;
-
- bool Success = true;
- Success &= TryGetMCExprValue(Args[0], MajorVersion);
- Success &= TryGetMCExprValue(Args[3], XNACKUsed);
- Success &= TryGetMCExprValue(Args[4], hasArchitectedFlatScr);
-
- assert(Success &&
- "Arguments 1, 4, and 5 for ExtraSGPRs should be known constants");
- if (!Success || !TryGetMCExprValue(Args[1], VCCUsed) ||
- !TryGetMCExprValue(Args[2], FlatScrUsed))
- return false;
+ assert(Args.size() == 3 &&
+ "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+ const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+ uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0;
- if (VCCUsed)
- ExtraSGPRs = 2;
- if (MajorVersion >= 10) {
- Res = MCValue::get(ExtraSGPRs);
- return true;
- }
- if (MajorVersion < 8) {
- if (FlatScrUsed)
- ExtraSGPRs = 4;
- } else {
- if (XNACKUsed)
- ExtraSGPRs = 4;
- if (FlatScrUsed || hasArchitectedFlatScr)
- ExtraSGPRs = 6;
- }
-
- Res = MCValue::get(ExtraSGPRs);
- return true;
- }
+ bool Success = TryGetMCExprValue(Args[2], XNACKUsed);
- if (Kind == AGVK_AlignTo) {
- assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for AlignTo");
- uint64_t Value, Align;
- if (!TryGetMCExprValue(Args[0], Value) ||
- !TryGetMCExprValue(Args[1], Align))
+ assert(Success && "Arguments 3 for ExtraSGPRs should be a known constant");
+ if (!Success || !TryGetMCExprValue(Args[0], VCCUsed) ||
+ !TryGetMCExprValue(Args[1], FlatScrUsed))
+ return false;
+
+ uint64_t ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+ STI, (bool)VCCUsed, (bool)FlatScrUsed, (bool)XNACKUsed);
+ Res = MCValue::get(ExtraSGPRs);
+ return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
return false;
- Res = MCValue::get(alignTo(Value, Align));
+ ConstantValue = MCVal.getConstant();
return true;
- }
+ };
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+ const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+ uint64_t NumAGPR = 0, NumVGPR = 0;
- if (Kind == AGVK_TotalNumVGPRs90A) {
- assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs90A");
- uint64_t NumAGPR, NumVGPR, Total;
- if (!TryGetMCExprValue(Args[0], NumAGPR) ||
- !TryGetMCExprValue(Args[1], NumVGPR))
- return false;
+ bool Has90AInsts = AMDGPU::isGFX90A(*STI);
- if (NumAGPR) {
- Total = alignTo(NumVGPR, 4) + NumAGPR;
- } else {
- Total = std::max(NumVGPR, NumAGPR);
- }
+ if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+ !TryGetMCExprValue(Args[1], NumVGPR))
+ return false;
- Res = MCValue::get(Total);
- return true;
- }
+ uint64_t TotalNum = Has90AInsts && NumAGPR ? alignTo(NumVGPR, 4) + NumAGPR
+ : std::max(NumVGPR, NumAGPR);
+ Res = MCValue::get(TotalNum);
+ return true;
+}
- if (Kind == AGVK_TotalNumVGPRs) {
- assert(Args.size() == 2 &&
- "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
- uint64_t NumAGPR, NumVGPR;
- if (!TryGetMCExprValue(Args[0], NumAGPR) ||
- !TryGetMCExprValue(Args[1], NumVGPR))
+bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
return false;
- Res = MCValue::get(std::max(NumVGPR, NumAGPR));
+ ConstantValue = MCVal.getConstant();
return true;
- }
-
- if (Kind == AGVK_Occupancy) {
- assert(Args.size() == 7 &&
- "AMDGPUVariadic Argument count incorrect for Occupancy");
- uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
- NumSGPRs, NumVGPRs;
+ };
- bool Success = true;
- Success &= TryGetMCExprValue(Args[0], MaxWaves);
- Success &= TryGetMCExprValue(Args[1], Granule);
- Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
- Success &= TryGetMCExprValue(Args[3], Generation);
- Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for AlignTo");
+ uint64_t Value = 0, Align = 0;
+ if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align))
+ return false;
- assert(Success &&
- "Arguments 1 to 5 for Occupancy should be known constants");
+ Res = MCValue::get(alignTo(Value, Align));
+ return true;
+}
- if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
- !TryGetMCExprValue(Args[6], NumVGPRs))
+bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
return false;
- auto OccWithNumVGPRs = [&](uint64_t NumVGPRs) -> uint64_t {
- return IsaInfo::getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves,
- TargetTotalNumVGPRs);
- };
-
- // Mirrors GCNSubtarget::getOccupancyWithNumSGPRs without dependency on
- // subtarget.
- auto OccWithNumSGPRs = [&](uint64_t NumSGPRs) -> uint64_t {
- if (Generation >= AMDGPUSubtarget::GFX10)
- return MaxWaves;
-
- if (Generation >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- if (NumSGPRs <= 80)
- return 10;
- if (NumSGPRs <= 88)
- return 9;
- if (NumSGPRs <= 100)
- return 8;
- return 7;
- }
- if (NumSGPRs <= 48)
- return 10;
- if (NumSGPRs <= 56)
- return 9;
- if (NumSGPRs <= 64)
- return 8;
- if (NumSGPRs <= 72)
- return 7;
- if (NumSGPRs <= 80)
- return 6;
- return 5;
- };
-
- uint64_t Occupancy = InitOccupancy;
- if (NumSGPRs)
- Occupancy = std::min(Occupancy, OccWithNumSGPRs(NumSGPRs));
- if (NumVGPRs)
- Occupancy = std::min(Occupancy, OccWithNumVGPRs(NumVGPRs));
-
- Res = MCValue::get(Occupancy);
+ ConstantValue = MCVal.getConstant();
return true;
+ };
+ assert(Args.size() == 7 &&
+ "AMDGPUVariadic Argument count incorrect for Occupancy");
+ uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
+ NumSGPRs, NumVGPRs;
+
+ bool Success = true;
+ Success &= TryGetMCExprValue(Args[0], MaxWaves);
+ Success &= TryGetMCExprValue(Args[1], Granule);
+ Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
+ Success &= TryGetMCExprValue(Args[3], Generation);
+ Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+
+ assert(Success && "Arguments 1 to 5 for Occupancy should be known constants");
+
+ if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
+ !TryGetMCExprValue(Args[6], NumVGPRs))
+ return false;
+
+ unsigned Occupancy = InitOccupancy;
+ if (NumSGPRs)
+ Occupancy = std::min(
+ Occupancy, IsaInfo::getOccupancyWithNumSGPRs(
+ NumSGPRs, MaxWaves,
+ static_cast<AMDGPUSubtarget::Generation>(Generation)));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy,
+ IsaInfo::getNumWavesPerEUWithNumVGPRs(
+ NumVGPRs, Granule, MaxWaves, TargetTotalNumVGPRs));
+
+ Res = MCValue::get(Occupancy);
+ return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
+ MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
+ std::optional<int64_t> Total;
+
+ switch (Kind) {
+ default:
+ break;
+ case AGVK_ExtraSGPRs:
+ return evaluateExtraSGPRs(Res, Layout, Fixup);
+ case AGVK_AlignTo:
+ return evaluateAlignTo(Res, Layout, Fixup);
+ case AGVK_TotalNumVGPRs:
+ return evaluateTotalNumVGPR(Res, Layout, Fixup);
+ case AGVK_Occupancy:
+ return evaluateOccupancy(Res, Layout, Fixup);
}
for (const MCExpr *Arg : Args) {
@@ -289,25 +275,19 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
/// are unresolvable but needed for further MCExprs). Derived from
/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
///
-const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createExtraSGPRs(
- const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, unsigned MajorVersion,
- bool hasArchitectedFlatScratch, bool XNACKUsed, MCContext &Ctx) {
- auto GetConstantExpr = [&Ctx](int64_t Value) {
- return MCConstantExpr::create(Value, Ctx);
- };
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed, MCContext &Ctx) {
return create(AGVK_ExtraSGPRs,
- {GetConstantExpr(MajorVersion), VCCUsed, FlatScrUsed,
- GetConstantExpr(XNACKUsed),
- GetConstantExpr(hasArchitectedFlatScratch)},
+ {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)},
Ctx);
}
const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
- bool has90AInsts, const MCExpr *NumAGPR, const MCExpr *NumVGPR,
- MCContext &Ctx) {
- return create(has90AInsts ? AGVK_TotalNumVGPRs90A : AGVK_TotalNumVGPRs,
- {NumAGPR, NumVGPR}, Ctx);
+ const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) {
+ return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
}
/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index f317ef73fa3e24..f92350b592350a 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -35,7 +35,6 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
AGVK_Max,
AGVK_ExtraSGPRs,
AGVK_TotalNumVGPRs,
- AGVK_TotalNumVGPRs90A,
AGVK_AlignTo,
AGVK_Occupancy
};
@@ -50,6 +49,15 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
MCContext &Ctx);
~AMDGPUVariadicMCExpr();
+ bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+
public:
static const AMDGPUVariadicMCExpr *
create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
@@ -64,13 +72,12 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
return create(VariadicKind::AGVK_Max, Args, Ctx);
}
- static const AMDGPUVariadicMCExpr *
- createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed,
- unsigned MajorVersion, bool hasArchitectedFlatScratch,
- bool XNACKUsed, MCContext &Ctx);
+ static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed,
+ MCContext &Ctx);
- static const AMDGPUVariadicMCExpr *createTotalNumVGPR(bool has90AInsts,
- const MCExpr *NumAGPR,
+ static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
const MCExpr *NumVGPR,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 82897a68082a74..05ff357f6676cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1143,6 +1143,33 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
}
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+ AMDGPUSubtarget::Generation Gen) {
+ if (Gen >= AMDGPUSubtarget::GFX10)
+ return MaxWaves;
+
+ if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (SGPRs <= 80)
+ return 10;
+ if (SGPRs <= 88)
+ return 9;
+ if (SGPRs <= 100)
+ return 8;
+ return 7;
+ }
+ if (SGPRs <= 48)
+ return 10;
+ if (SGPRs <= 56)
+ return 9;
+ if (SGPRs <= 64)
+ return 8;
+ if (SGPRs <= 72)
+ return 7;
+ if (SGPRs <= 80)
+ return 6;
+ return 5;
+}
+
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8b9ce1b191a298..905ac4d36153aa 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InstrTypes.h"
@@ -317,6 +318,11 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
unsigned MaxWaves,
unsigned TotalNumVGPRs);
+/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p
+/// Gen.
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+ AMDGPUSubtarget::Generation Gen);
+
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used. We actually return the number of blocks -1, since
/// that's what we encode.
diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
index 89e4954b805471..e88b23bb34d4f0 100644
--- a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
+++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
@@ -1,41 +1,31 @@
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
-
-// ASM: .set extrasgpr_none_gfx7, 0
-// ASM: .set extrasgpr_none_gfx9, 0
-// ASM: .set extrasgpr_none_gfx10, 0
-
-.set extrasgpr_none_gfx7, extrasgprs(7, 0, 0, 0, 0)
-.set extrasgpr_none_gfx9, extrasgprs(9, 0, 0, 0, 0)
-.set extrasgpr_none_gfx10, extrasgprs(10, 0, 0, 0, 0)
-
-// ASM: .set extrasgpr_vcc_gfx7, 2
-// ASM: .set extrasgpr_vcc_gfx9, 2
-// ASM: .set extrasgpr_vcc_gfx10, 2
-
-.set extrasgpr_vcc_gfx7, extrasgprs(7, 1, 0, 0, 0)
-.set extrasgpr_vcc_gfx9, extrasgprs(9, 1, 0, 0, 0)
-.set extrasgpr_vcc_gfx10, extrasgprs(10, 1, 0, 0, 0)
-
-// ASM: .set extrasgpr_flatscr_gfx7, 4
-// ASM: .set extrasgpr_flatscr_gfx9, 6
-// ASM: .set extrasgpr_flatscr_gfx10, 0
-
-.set extrasgpr_flatscr_gfx7, extrasgprs(7, 0, 1, 0, 0)
-.set extrasgpr_flatscr_gfx9, extrasgprs(9, 0, 1, 0, 0)
-.set extrasgpr_flatscr_gfx10, extrasgprs(10, 0, 1, 0, 0)
-
-// ASM: .set extrasgpr_xnack_gfx7, 0
-// ASM: .set extrasgpr_xnack_gfx9, 4
-// ASM: .set extrasgpr_xnack_gfx10, 0
-
-.set extrasgpr_xnack_gfx7, extrasgprs(7, 0, 0, 1, 0)
-.set extrasgpr_xnack_gfx9, extrasgprs(9, 0, 0, 1, 0)
-.set extrasgpr_xnack_gfx10, extrasgprs(10, 0, 0, 1, 0)
-
-// ASM: .set extrasgpr_archflatscr_gfx7, 0
-// ASM: .set extrasgpr_archflatscr_gfx9, 6
-// ASM: .set extrasgpr_archflatscr_gfx10, 0
-
-.set extrasgpr_archflatscr_gfx7, extrasgprs(7, 0, 0, 0, 1)
-.set extrasgpr_archflatscr_gfx9, extrasgprs(9, 0, 0, 0, 1)
-.set extrasgpr_archflatscr_gfx10, extrasgprs(10, 0, 0, 0, 1)
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefix=GFX7 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=GFX940 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+
+// gfx940 has architected flat scratch enabled.
+
+// GFX7: .set extrasgpr_none, 0
+// GFX7: .set extrasgpr_vcc, 2
+// GFX7: .set extrasgpr_flatscr, 4
+// GFX7: .set extrasgpr_xnack, 0
+
+// GFX90A: .set extrasgpr_none, 0
+// GFX90A: .set extrasgpr_vcc, 2
+// GFX90A: .set extrasgpr_flatscr, 6
+// GFX90A: .set extrasgpr_xnack, 4
+
+// GFX940: .set extrasgpr_none, 6
+// GFX940: .set extrasgpr_vcc, 6
+// GFX940: .set extrasgpr_flatscr, 6
+// GFX940: .set extrasgpr_xnack, 6
+
+// GFX10: .set extrasgpr_none, 0
+// GFX10: .set extrasgpr_vcc, 2
+// GFX10: .set extrasgpr_flatscr, 0
+// GFX10: .set extrasgpr_xnack, 0
+
+.set extrasgpr_none, extrasgprs(0, 0, 0)
+.set extrasgpr_vcc, extrasgprs(1, 0, 0)
+.set extrasgpr_flatscr, extrasgprs(0, 1, 0)
+.set extrasgpr_xnack, extrasgprs(0, 0, 1)
diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
index 58f317731df849..29bb885b208043 100644
--- a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
+++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
@@ -1,25 +1,26 @@
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
-// ASM: .set totalvgpr_none, 0
-// ASM: .set totalvgpr_one, 1
-// ASM: .set totalvgpr_two, 2
+// GFX10: .set totalvgpr_none, 0
+// GFX10: .set totalvgpr_one, 1
+// GFX10: .set totalvgpr_two, 2
.set totalvgpr_none, totalnumvgprs(0, 0)
.set totalvgpr_one, totalnumvgprs(1, 0)
.set totalvgpr_two, totalnumvgprs(1, 2)
-// ASM: .set totalvgpr90a_none, 0
-// ASM: .set totalvgpr90a_one, 1
-// ASM: .set totalvgpr90a_two, 2
+// GFX90A: .set totalvgpr90a_none, 0
+// GFX90A: .set totalvgpr90a_one, 1
+// GFX90A: .set totalvgpr90a_two, 2
-.set totalvgpr90a_none, totalnumvgprs90a(0, 0)
-.set totalvgpr90a_one, totalnumvgprs90a(0, 1)
-.set totalvgpr90a_two, totalnumvgprs90a(0, 2)
+.set totalvgpr90a_none, totalnumvgprs(0, 0)
+.set totalvgpr90a_one, totalnumvgprs(0, 1)
+.set totalvgpr90a_two, totalnumvgprs(0, 2)
-// ASM: .set totalvgpr90a_agpr_minimal, 1
-// ASM: .set totalvgpr90a_agpr_rounded_eight, 8
-// ASM: .set totalvgpr90a_agpr_exact_eight, 8
+// GFX90A: .set totalvgpr90a_agpr_minimal, 1
+// GFX90A: .set totalvgpr90a_agpr_rounded_eight, 8
+// GFX90A: .set totalvgpr90a_agpr_exact_eight, 8
-.set totalvgpr90a_agpr_minimal, totalnumvgprs90a(1, 0)
-.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs90a(4, 2)
-.set totalvgpr90a_agpr_exact_eight, totalnumvgprs90a(4, 4)
+.set totalvgpr90a_agpr_minimal, totalnumvgprs(1, 0)
+.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs(4, 2)
+.set totalvgpr90a_agpr_exact_eight, totalnumvgprs(4, 4)
More information about the llvm-commits
mailing list