[llvm] d86b68a - MCExpr-ify SIProgramInfo (#88257)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 05:02:36 PDT 2024
Author: Janek van Oirschot
Date: 2024-05-09T13:02:32+01:00
New Revision: d86b68afd7f0d7684adc312bcdc87f9027d0d896
URL: https://github.com/llvm/llvm-project/commit/d86b68afd7f0d7684adc312bcdc87f9027d0d896
DIFF: https://github.com/llvm/llvm-project/commit/d86b68afd7f0d7684adc312bcdc87f9027d0d896.diff
LOG: MCExpr-ify SIProgramInfo (#88257)
Convert members in SIProgramInfo affected by variables provided by AMDGPUResourceUsageAnalysis into MCExprs.
Added:
llvm/test/MC/AMDGPU/alignto_mcexpr.s
llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
llvm/test/MC/AMDGPU/occupancy_mcexpr.s
llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
llvm/lib/Target/AMDGPU/SIProgramInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/unittests/MC/AMDGPU/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 89a5ceac629b1..de81904143b7b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -22,6 +22,7 @@
#include "AMDKernelCodeT.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
#include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "R600AsmPrinter.h"
@@ -134,6 +135,15 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->getPALMetadata()->readFromIR(M);
}
+uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
+ int64_t Val;
+ if (!Value->evaluateAsAbsolute(Val)) {
+ Ctx.reportError(SMLoc(), "could not resolve expression when required.");
+ return 0;
+ }
+ return static_cast<uint64_t>(Val);
+}
+
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Init target streamer if it has not yet happened
if (!IsTargetStreamerInitialized)
@@ -237,12 +247,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
getNameWithPrefix(KernelName, &MF->getFunction());
getTargetStreamer()->EmitAmdhsaKernelDescriptor(
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
- CurrentProgramInfo.NumVGPRsForWavesPerEU,
- CurrentProgramInfo.NumSGPRsForWavesPerEU -
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
IsaInfo::getNumExtraSGPRs(
- &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+ getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
+ getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
+ getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
Streamer.popSection();
}
@@ -422,7 +434,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
- if (CurrentProgramInfo.DynamicCallStack &&
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
@@ -439,29 +451,22 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
MCKernelDescriptor KernelDescriptor;
- assert(isUInt<32>(PI.ScratchSize));
- assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
- assert(isUInt<32>(PI.getComputePGMRSrc2()));
-
KernelDescriptor.group_segment_fixed_size =
MCConstantExpr::create(PI.LDSSize, Ctx);
- KernelDescriptor.private_segment_fixed_size =
- MCConstantExpr::create(PI.ScratchSize, Ctx);
+ KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
Align MaxKernArgAlign;
KernelDescriptor.kernarg_size = MCConstantExpr::create(
STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
- KernelDescriptor.compute_pgm_rsrc1 =
- MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx);
- KernelDescriptor.compute_pgm_rsrc2 =
- MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx);
+ KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
+ KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
KernelDescriptor.kernel_code_properties =
MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
- assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
- KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
- STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
+ assert(STM.hasGFX90AInsts() ||
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
+ KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
KernelDescriptor.kernarg_preload = MCConstantExpr::create(
AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -477,9 +482,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
initTargetStreamer(*MF.getFunction().getParent());
ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
- CurrentProgramInfo = SIProgramInfo();
+ CurrentProgramInfo.reset(MF);
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ MCContext &Ctx = MF.getContext();
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
@@ -550,11 +556,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Kernel info:", false);
emitCommonFunctionComments(
- CurrentProgramInfo.NumArchVGPR,
- STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
+ STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
: std::optional<uint32_t>(),
- CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
- CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+ getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
+ getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
+ getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
+ getFunctionCodeSize(MF), MFI);
OutStreamer->emitRawComment(
" FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -565,32 +573,44 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" bytes/workgroup (compile time only)", false);
OutStreamer->emitRawComment(
- " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ " SGPRBlocks: " +
+ Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
+ false);
OutStreamer->emitRawComment(
- " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+ " VGPRBlocks: " +
+ Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
+ false);
OutStreamer->emitRawComment(
- " NumSGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ " NumSGPRsForWavesPerEU: " +
+ Twine(
+ getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
+ false);
OutStreamer->emitRawComment(
- " NumVGPRsForWavesPerEU: " +
- Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+ " NumVGPRsForWavesPerEU: " +
+ Twine(
+ getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
+ false);
if (STM.hasGFX90AInsts())
OutStreamer->emitRawComment(
- " AccumOffset: " +
- Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+ " AccumOffset: " +
+ Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
+ 4),
+ false);
OutStreamer->emitRawComment(
- " Occupancy: " +
- Twine(CurrentProgramInfo.Occupancy), false);
+ " Occupancy: " +
+ Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
+ false);
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(CurrentProgramInfo.ScratchEnable),
- false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+ Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
+ false);
OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
Twine(CurrentProgramInfo.UserSGPR),
false);
@@ -611,18 +631,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
false);
assert(STM.hasGFX90AInsts() ||
- CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
if (STM.hasGFX90AInsts()) {
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
- Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
- false);
+ " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+ Twine((AMDHSA_BITS_GET(
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+ false);
OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
- Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
- false);
+ " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+ Twine((AMDHSA_BITS_GET(
+ getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+ false);
}
}
@@ -702,23 +724,40 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
+
+ auto CreateExpr = [&Ctx](int64_t Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
- ProgInfo.NumArchVGPR = Info.NumVGPR;
- ProgInfo.NumAccVGPR = Info.NumAGPR;
- ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
- ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ auto TryGetMCExprValue = [&Ctx](const MCExpr *Value, uint64_t &Res) -> bool {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val)) {
+ Res = Val;
+ return true;
+ }
+ return false;
+ };
+
+ ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
+ ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
+ ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
+ ProgInfo.AccumOffset =
+ CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
- ProgInfo.NumSGPR = Info.NumExplicitSGPR;
- ProgInfo.ScratchSize = Info.PrivateSegmentSize;
- ProgInfo.VCCUsed = Info.UsesVCC;
- ProgInfo.FlatUsed = Info.UsesFlatScratch;
- ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+ ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
+ ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
+ ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
+ ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+ ProgInfo.DynamicCallStack =
+ CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
- if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
- DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
- ProgInfo.ScratchSize,
+ uint64_t ScratchSize;
+ if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
+ ScratchSize > MaxScratchPerWorkitem) {
+ DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
MaxScratchPerWorkitem, DS_Error);
MF.getFunction().getContext().diagnose(DiagStackSize);
}
@@ -728,27 +767,29 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// The calculations related to SGPR/VGPR blocks are
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
- unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
- getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+ const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+ ProgInfo.VCCUsed, ProgInfo.FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
- if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ uint64_t NumSgpr;
+ if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm.
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(
- MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
+ MF.getFunction(), "addressable scalar registers", NumSgpr,
MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
+ ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
}
}
// Account for extra SGPRs and VGPRs reserved for debugger use.
- ProgInfo.NumSGPR += ExtraSGPRs;
+ ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
const Function &F = MF.getFunction();
@@ -819,40 +860,51 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
}
}
}
- ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
- ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
- ProgInfo.NumVGPR =
- Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
+ ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+
+ ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
+
+ ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
}
// Adjust number of registers used to meet default/requested minimum/maximum
// number of waves per execution unit request.
- ProgInfo.NumSGPRsForWavesPerEU = std::max(
- std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
- ProgInfo.NumVGPRsForWavesPerEU = std::max(
- std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+ unsigned MaxWaves = MFI->getMaxWavesPerEU();
+ ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumSGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
+ Ctx);
+ ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+ {ProgInfo.NumVGPR, CreateExpr(1ul),
+ CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+ Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
- if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+ uint64_t NumSgpr;
+ if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
// This can happen due to a compiler bug or when using inline asm to use
// the registers which are usually reserved for vcc etc.
LLVMContext &Ctx = MF.getFunction().getContext();
DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
- ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
+ NumSgpr, MaxAddressableNumSGPRs,
DS_Error, DK_ResourceLimit);
Ctx.diagnose(Diag);
- ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
- ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
+ ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
+ ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
}
}
if (STM.hasSGPRInitBug()) {
ProgInfo.NumSGPR =
- AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
ProgInfo.NumSGPRsForWavesPerEU =
- AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+ CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
}
if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
@@ -871,11 +923,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.getAddressableLocalMemorySize(), DS_Error);
Ctx.diagnose(Diag);
}
+ // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+ // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+ auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
+ unsigned Granule) {
+ const MCExpr *OneConst = CreateExpr(1ul);
+ const MCExpr *GranuleConst = CreateExpr(Granule);
+ const MCExpr *MaxNumGPR =
+ AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+ const MCExpr *AlignToGPR =
+ AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+ const MCExpr *DivGPR =
+ MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+ const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+ return SubGPR;
+ };
- ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
- &STM, ProgInfo.NumSGPRsForWavesPerEU);
- ProgInfo.VGPRBlocks =
- IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
+ ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
+ IsaInfo::getSGPREncodingGranule(&STM));
+ ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
+ IsaInfo::getVGPREncodingGranule(&STM));
const SIModeRegisterDefaults Mode = MFI->getMode();
@@ -904,14 +971,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
+ // The MCExpr equivalent of divideCeil.
+ auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
+ const MCExpr *Ceil =
+ AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+ return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
+ };
+
// Scratch is allocated in 64-dword or 256-dword blocks.
unsigned ScratchAlignShift =
STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
- ProgInfo.ScratchBlocks = divideCeil(
- ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
+ ProgInfo.ScratchBlocks = DivideCeil(
+ MCBinaryExpr::createMul(ProgInfo.ScratchSize,
+ CreateExpr(STM.getWavefrontSize()), Ctx),
+ CreateExpr(1ULL << ScratchAlignShift));
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -930,8 +1006,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// anything to disable it if we know the stack isn't used here. We may still
// have emitted code reading it to initialize scratch, but if that's unused
// reading garbage should be OK.
- ProgInfo.ScratchEnable =
- ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
+ ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
+ MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
+ MCConstantExpr::create(0, Ctx), Ctx),
+ ProgInfo.DynamicCallStack, Ctx);
+
ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
// For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
ProgInfo.TrapHandlerEnable =
@@ -947,26 +1026,41 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.EXCPEnable = 0;
if (STM.hasGFX90AInsts()) {
- AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
- ProgInfo.AccumOffset);
- AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
- amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
- ProgInfo.TgSplit);
+ // return ((Dst & ~Mask) | (Value << Shift))
+ auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+ uint32_t Shift) {
+ auto Shft = MCConstantExpr::create(Shift, Ctx);
+ auto Msk = MCConstantExpr::create(Mask, Ctx);
+ Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+ Dst = MCBinaryExpr::createOr(
+ Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
+ return Dst;
+ };
+
+ ProgInfo.ComputePGMRSrc3GFX90A =
+ SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
+ ProgInfo.ComputePGMRSrc3GFX90A =
+ SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+ amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
- ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
- ProgInfo.NumSGPRsForWavesPerEU,
- ProgInfo.NumVGPRsForWavesPerEU);
+ ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+ STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
- if (ProgInfo.Occupancy < MinWEU) {
+ uint64_t Occupancy;
+ if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
DiagnosticInfoOptimizationFailure Diag(
F, F.getSubprogram(),
"failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
"'" +
F.getName() + "': desired occupancy was " + Twine(MinWEU) +
- ", final occupancy is " + Twine(ProgInfo.Occupancy));
+ ", final occupancy is " + Twine(Occupancy));
F.getContext().diagnose(Diag);
}
}
@@ -989,36 +1083,78 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
+ MCContext &Ctx = MF.getContext();
+
+ // (((Value) & Mask) << Shift)
+ auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
+ const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
+ const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
+ return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
+ shft, Ctx);
+ };
+
+ auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val))
+ OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
+ else
+ OutStreamer->emitValue(Value, Size);
+ };
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
+ EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
+ /*Size=*/4);
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
- OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
+ EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
- OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX12
- ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
- : STM.getGeneration() == AMDGPUSubtarget::GFX11
- ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
- : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+ // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+ // appropriate generation.
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x3FFFF, /*Shift=*/12),
+ /*Size=*/4);
+ } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x7FFF, /*Shift=*/12),
+ /*Size=*/4);
+ } else {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x1FFF, /*Shift=*/12),
+ /*Size=*/4);
+ }
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer->emitInt32(RsrcReg);
- OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
- S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
+
+ const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
+ SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
+ SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
+ MF.getContext());
+ EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
- OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX12
- ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
- : STM.getGeneration() == AMDGPUSubtarget::GFX11
- ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
- : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+ // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+ // appropriate generation.
+ if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x3FFFF, /*Shift=*/12),
+ /*Size=*/4);
+ } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x7FFF, /*Shift=*/12),
+ /*Size=*/4);
+ } else {
+ EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+ /*Mask=*/0x1FFF, /*Shift=*/12),
+ /*Size=*/4);
+ }
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1070,33 +1206,38 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto CC = MF.getFunction().getCallingConv();
auto MD = getTargetStreamer()->getPALMetadata();
+ auto &Ctx = MF.getContext();
MD->setEntryPoint(CC, MF.getFunction().getName());
- MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setNumUsedVgprs(
+ CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
// Only set AGPRs for supported devices
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.hasMAIInsts()) {
- MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+ MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
}
- MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+ MD->setNumUsedSgprs(
+ CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
if (MD->getPALMajorVersion() < 3) {
MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
} else {
- if (CurrentProgramInfo.ScratchBlocks > 0)
+ if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
}
} else {
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
- MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+ MD->setHwStage(CC, ".scratch_en",
+ (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
}
// ScratchSize is in bytes, 16 aligned.
- MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
+ MD->setScratchSize(
+ CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
@@ -1145,6 +1286,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
StringRef FnName = MF.getFunction().getName();
MD->setFunctionScratchSize(FnName, MFI.getStackSize());
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
if (MD->getPALMajorVersion() < 3) {
// Set compute registers
@@ -1158,8 +1300,10 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
// Set optional info
MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
- MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
- MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+ MD->setFunctionNumUsedVgprs(
+ FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
+ MD->setFunctionNumUsedSgprs(
+ FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
}
// This is supposed to be log2(Size)
@@ -1185,6 +1329,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ MCContext &Ctx = MF.getContext();
AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
@@ -1193,7 +1338,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
- if (CurrentProgramInfo.DynamicCallStack)
+ if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
AMD_HSA_BITS_SET(Out.code_properties,
@@ -1229,9 +1374,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
- Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
- Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+ Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
+ Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
+ Out.workitem_private_segment_byte_size =
+ getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
// kernarg_segment_alignment is specified as log of the alignment.
@@ -1322,19 +1468,28 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// remarks to simulate newlines. If and when clang does accept newlines, this
// formatting should be aggregated into one remark with newlines to avoid
// printing multiple diagnostic location and diag opts.
+ MCContext &MCCtx = MF.getContext();
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
- EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
- EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
- if (hasMAIInsts)
- EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
- EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
- CurrentProgramInfo.ScratchSize);
+ EmitResourceUsageRemark("NumSGPR", "SGPRs",
+ getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
+ EmitResourceUsageRemark(
+ "NumVGPR", "VGPRs",
+ getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
+ if (hasMAIInsts) {
+ EmitResourceUsageRemark(
+ "NumAGPR", "AGPRs",
+ getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
+ }
+ EmitResourceUsageRemark(
+ "ScratchSize", "ScratchSize [bytes/lane]",
+ getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
StringRef DynamicStackStr =
- CurrentProgramInfo.DynamicCallStack ? "True" : "False";
+ getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
+ : "False";
EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
- CurrentProgramInfo.Occupancy);
+ getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
CurrentProgramInfo.SGPRSpill);
EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index b8b2718d293e6..16d8952a533ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
void initTargetStreamer(Module &M);
+ static uint64_t getMCExprValue(const MCExpr *Value, MCContext &Ctx);
+
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 9e288ab50e170..7ab9ba2851332 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,8 @@
#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"
#include "llvm/IR/Module.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
using namespace llvm;
static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -462,6 +464,16 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
+ auto GetMCExprValue = [&MF](const MCExpr *Value) {
+ int64_t Val;
+ if (!Value->evaluateAsAbsolute(Val)) {
+ MCContext &Ctx = MF.getContext();
+ Ctx.reportError(SMLoc(), "could not resolve expression when required.");
+ Val = 0;
+ }
+ return static_cast<uint64_t>(Val);
+ };
+
auto Kern = HSAMetadataDoc->getMapNode();
Align MaxKernArgAlign;
@@ -470,10 +482,11 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern[".group_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
- Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
- if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
- Kern[".uses_dynamic_stack"] =
- Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.ScratchSize));
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5) {
+ Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
+ static_cast<bool>(GetMCExprValue(ProgramInfo.DynamicCallStack)));
+ }
if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
Kern[".workgroup_processor_mode"] =
@@ -484,12 +497,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
- Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
- Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+ Kern[".sgpr_count"] =
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumSGPR));
+ Kern[".vgpr_count"] =
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumVGPR));
// Only add AGPR count to metadata for supported devices
if (STM.hasMAIInsts()) {
- Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+ Kern[".agpr_count"] =
+ Kern.getDocument()->getNode(GetMCExprValue(ProgramInfo.NumAccVGPR));
}
Kern[".max_flat_workgroup_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9cfe81e5288ee..94ee4ac78142d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -674,29 +674,8 @@ bool GCNSubtarget::useVGPRIndexMode() const {
bool GCNSubtarget::useAA() const { return UseAA; }
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
- if (getGeneration() >= AMDGPUSubtarget::GFX10)
- return getMaxWavesPerEU();
-
- if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
- if (SGPRs <= 80)
- return 10;
- if (SGPRs <= 88)
- return 9;
- if (SGPRs <= 100)
- return 8;
- return 7;
- }
- if (SGPRs <= 48)
- return 10;
- if (SGPRs <= 56)
- return 9;
- if (SGPRs <= 64)
- return 8;
- if (SGPRs <= 72)
- return 7;
- if (SGPRs <= 80)
- return 6;
- return 5;
+ return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(),
+ getGeneration());
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5ac245ac3b63d..d47a5f8ebb815 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8399,12 +8399,16 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
AGVK VK = StringSwitch<AGVK>(TokenId)
.Case("max", AGVK::AGVK_Max)
.Case("or", AGVK::AGVK_Or)
+ .Case("extrasgprs", AGVK::AGVK_ExtraSGPRs)
+ .Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs)
+ .Case("alignto", AGVK::AGVK_AlignTo)
+ .Case("occupancy", AGVK::AGVK_Occupancy)
.Default(AGVK::AGVK_None);
if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
SmallVector<const MCExpr *, 4> Exprs;
uint64_t CommaCount = 0;
- lex(); // Eat 'max'/'or'
+ lex(); // Eat Arg ('or', 'max', 'occupancy', etc.)
lex(); // Eat '('
while (true) {
if (trySkipToken(AsmToken::RParen)) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4578c33d92dce..159664faf983f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,6 +7,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCExpr.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
@@ -16,6 +19,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::AMDGPU;
AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
ArrayRef<const MCExpr *> Args,
@@ -61,6 +65,18 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
case AGVK_Max:
OS << "max(";
break;
+ case AGVK_ExtraSGPRs:
+ OS << "extrasgprs(";
+ break;
+ case AGVK_TotalNumVGPRs:
+ OS << "totalnumvgprs(";
+ break;
+ case AGVK_AlignTo:
+ OS << "alignto(";
+ break;
+ case AGVK_Occupancy:
+ OS << "occupancy(";
+ break;
}
for (auto It = Args.begin(); It != Args.end(); ++It) {
(*It)->print(OS, MAI, /*InParens=*/false);
@@ -82,10 +98,151 @@ static int64_t op(AMDGPUVariadicMCExpr::VariadicKind Kind, int64_t Arg1,
}
}
+bool AMDGPUVariadicMCExpr::evaluateExtraSGPRs(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
+ return false;
+
+ ConstantValue = MCVal.getConstant();
+ return true;
+ };
+
+ assert(Args.size() == 3 &&
+ "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+ const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+ uint64_t VCCUsed = 0, FlatScrUsed = 0, XNACKUsed = 0;
+
+ bool Success = TryGetMCExprValue(Args[2], XNACKUsed);
+
+ assert(Success && "Arguments 3 for ExtraSGPRs should be a known constant");
+ if (!Success || !TryGetMCExprValue(Args[0], VCCUsed) ||
+ !TryGetMCExprValue(Args[1], FlatScrUsed))
+ return false;
+
+ uint64_t ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
+ STI, (bool)VCCUsed, (bool)FlatScrUsed, (bool)XNACKUsed);
+ Res = MCValue::get(ExtraSGPRs);
+ return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateTotalNumVGPR(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
+ return false;
+
+ ConstantValue = MCVal.getConstant();
+ return true;
+ };
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+ const MCSubtargetInfo *STI = Ctx.getSubtargetInfo();
+ uint64_t NumAGPR = 0, NumVGPR = 0;
+
+ bool Has90AInsts = AMDGPU::isGFX90A(*STI);
+
+ if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+ !TryGetMCExprValue(Args[1], NumVGPR))
+ return false;
+
+ uint64_t TotalNum = Has90AInsts && NumAGPR ? alignTo(NumVGPR, 4) + NumAGPR
+ : std::max(NumVGPR, NumAGPR);
+ Res = MCValue::get(TotalNum);
+ return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateAlignTo(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
+ return false;
+
+ ConstantValue = MCVal.getConstant();
+ return true;
+ };
+
+ assert(Args.size() == 2 &&
+ "AMDGPUVariadic Argument count incorrect for AlignTo");
+ uint64_t Value = 0, Align = 0;
+ if (!TryGetMCExprValue(Args[0], Value) || !TryGetMCExprValue(Args[1], Align))
+ return false;
+
+ Res = MCValue::get(alignTo(Value, Align));
+ return true;
+}
+
+bool AMDGPUVariadicMCExpr::evaluateOccupancy(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+ MCValue MCVal;
+ if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+ !MCVal.isAbsolute())
+ return false;
+
+ ConstantValue = MCVal.getConstant();
+ return true;
+ };
+ assert(Args.size() == 7 &&
+ "AMDGPUVariadic Argument count incorrect for Occupancy");
+ uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
+ NumSGPRs, NumVGPRs;
+
+ bool Success = true;
+ Success &= TryGetMCExprValue(Args[0], MaxWaves);
+ Success &= TryGetMCExprValue(Args[1], Granule);
+ Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
+ Success &= TryGetMCExprValue(Args[3], Generation);
+ Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+
+ assert(Success && "Arguments 1 to 5 for Occupancy should be known constants");
+
+ if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
+ !TryGetMCExprValue(Args[6], NumVGPRs))
+ return false;
+
+ unsigned Occupancy = InitOccupancy;
+ if (NumSGPRs)
+ Occupancy = std::min(
+ Occupancy, IsaInfo::getOccupancyWithNumSGPRs(
+ NumSGPRs, MaxWaves,
+ static_cast<AMDGPUSubtarget::Generation>(Generation)));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy,
+ IsaInfo::getNumWavesPerEUWithNumVGPRs(
+ NumVGPRs, Granule, MaxWaves, TargetTotalNumVGPRs));
+
+ Res = MCValue::get(Occupancy);
+ return true;
+}
+
bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
std::optional<int64_t> Total;
+ switch (Kind) {
+ default:
+ break;
+ case AGVK_ExtraSGPRs:
+ return evaluateExtraSGPRs(Res, Layout, Fixup);
+ case AGVK_AlignTo:
+ return evaluateAlignTo(Res, Layout, Fixup);
+ case AGVK_TotalNumVGPRs:
+ return evaluateTotalNumVGPR(Res, Layout, Fixup);
+ case AGVK_Occupancy:
+ return evaluateOccupancy(Res, Layout, Fixup);
+ }
+
for (const MCExpr *Arg : Args) {
MCValue ArgRes;
if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
@@ -113,3 +270,47 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
}
return nullptr;
}
+
+/// Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed
+/// are unresolvable but needed for further MCExprs). Derived from
+/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
+///
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed, MCContext &Ctx) {
+
+ return create(AGVK_ExtraSGPRs,
+ {VCCUsed, FlatScrUsed, MCConstantExpr::create(XNACKUsed, Ctx)},
+ Ctx);
+}
+
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
+ const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx) {
+ return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
+}
+
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only only the necessary values
+/// for said occupancy computation. Should match computeOccupancy implementation
+/// without passing \p STM on.
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM, MCContext &Ctx) {
+ unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+ unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+ unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+ unsigned Generation = STM.getGeneration();
+
+ auto CreateExpr = [&Ctx](unsigned Value) {
+ return MCConstantExpr::create(Value, Ctx);
+ };
+
+ return create(AGVK_Occupancy,
+ {CreateExpr(MaxWaves), CreateExpr(Granule),
+ CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
+ CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
+ Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 238e0dea791b2..f92350b592350 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -14,6 +14,9 @@
namespace llvm {
+class Function;
+class GCNSubtarget;
+
/// AMDGPU target specific variadic MCExpr operations.
///
/// Takes in a minimum of 1 argument to be used with an operation. The supported
@@ -26,7 +29,15 @@ namespace llvm {
///
class AMDGPUVariadicMCExpr : public MCTargetExpr {
public:
- enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+ enum VariadicKind {
+ AGVK_None,
+ AGVK_Or,
+ AGVK_Max,
+ AGVK_ExtraSGPRs,
+ AGVK_TotalNumVGPRs,
+ AGVK_AlignTo,
+ AGVK_Occupancy
+ };
private:
VariadicKind Kind;
@@ -38,6 +49,15 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
MCContext &Ctx);
~AMDGPUVariadicMCExpr();
+ bool evaluateExtraSGPRs(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateTotalNumVGPR(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateAlignTo(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+ bool evaluateOccupancy(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const;
+
public:
static const AMDGPUVariadicMCExpr *
create(VariadicKind Kind, ArrayRef<const MCExpr *> Args, MCContext &Ctx);
@@ -52,6 +72,26 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
return create(VariadicKind::AGVK_Max, Args, Ctx);
}
+ static const AMDGPUVariadicMCExpr *createExtraSGPRs(const MCExpr *VCCUsed,
+ const MCExpr *FlatScrUsed,
+ bool XNACKUsed,
+ MCContext &Ctx);
+
+ static const AMDGPUVariadicMCExpr *createTotalNumVGPR(const MCExpr *NumAGPR,
+ const MCExpr *NumVGPR,
+ MCContext &Ctx);
+
+ static const AMDGPUVariadicMCExpr *
+ createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
+ return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+ }
+
+ static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
+ const MCExpr *NumSGPRs,
+ const MCExpr *NumVGPRs,
+ const GCNSubtarget &STM,
+ MCContext &Ctx);
+
VariadicKind getKind() const { return Kind; }
const MCExpr *getSubExpr(size_t Index) const;
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 9ed7aacc0538e..0d40816cdd4b8 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -18,57 +18,114 @@
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
using namespace llvm;
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
- uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) |
- S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+void SIProgramInfo::reset(const MachineFunction &MF) {
+ MCContext &Ctx = MF.getContext();
+
+ const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+
+ VGPRBlocks = ZeroExpr;
+ SGPRBlocks = ZeroExpr;
+ Priority = 0;
+ FloatMode = 0;
+ Priv = 0;
+ DX10Clamp = 0;
+ DebugMode = 0;
+ IEEEMode = 0;
+ WgpMode = 0;
+ MemOrdered = 0;
+ RrWgMode = 0;
+ ScratchSize = ZeroExpr;
+
+ LDSBlocks = 0;
+ ScratchBlocks = ZeroExpr;
+
+ ScratchEnable = ZeroExpr;
+ UserSGPR = 0;
+ TrapHandlerEnable = 0;
+ TGIdXEnable = 0;
+ TGIdYEnable = 0;
+ TGIdZEnable = 0;
+ TGSizeEnable = 0;
+ TIdIGCompCount = 0;
+ EXCPEnMSB = 0;
+ LdsSize = 0;
+ EXCPEnable = 0;
+
+ ComputePGMRSrc3GFX90A = ZeroExpr;
+
+ NumVGPR = ZeroExpr;
+ NumArchVGPR = ZeroExpr;
+ NumAccVGPR = ZeroExpr;
+ AccumOffset = ZeroExpr;
+ TgSplit = 0;
+ NumSGPR = ZeroExpr;
+ SGPRSpill = 0;
+ VGPRSpill = 0;
+ LDSSize = 0;
+ FlatUsed = ZeroExpr;
+
+ NumSGPRsForWavesPerEU = ZeroExpr;
+ NumVGPRsForWavesPerEU = ZeroExpr;
+ Occupancy = ZeroExpr;
+ DynamicCallStack = ZeroExpr;
+ VCCUsed = ZeroExpr;
+}
+
+static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+ const GCNSubtarget &ST) {
+ uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+ S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
if (ST.hasDX10ClampMode())
- Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+ Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
if (ST.hasIEEEMode())
- Reg |= S_00B848_IEEE_MODE(IEEEMode);
+ Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
if (ST.hasRrWGMode())
- Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+ Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
return Reg;
}
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
- const GCNSubtarget &ST) const {
- if (AMDGPU::isCompute(CC)) {
- return getComputePGMRSrc1(ST);
- }
- uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode);
+static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+ CallingConv::ID CC, const GCNSubtarget &ST) {
+ uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+ S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+ S_00B848_PRIV(ProgInfo.Priv) |
+ S_00B848_DEBUG_MODE(ProgInfo.DebugMode);
if (ST.hasDX10ClampMode())
- Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+ Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
if (ST.hasIEEEMode())
- Reg |= S_00B848_IEEE_MODE(IEEEMode);
+ Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
if (ST.hasRrWGMode())
- Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+ Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
switch (CC) {
case CallingConv::AMDGPU_PS:
- Reg |= S_00B028_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_VS:
- Reg |= S_00B128_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_GS:
- Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
break;
case CallingConv::AMDGPU_HS:
- Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered);
+ Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
+ S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
break;
default:
break;
@@ -76,22 +133,108 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
return Reg;
}
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
- uint64_t Reg =
- S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
- S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
- S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
- S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
- S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
- S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
- S_00B84C_EXCP_EN(EXCPEnable);
+static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
+ uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
+ S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
+ S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
+ S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
+ S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
+ S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
+ S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
+ S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
+ S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
+ S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
+
+ return Reg;
+}
+
+static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
+ MCContext &Ctx) {
+ if (Mask) {
+ const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+ Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+ }
+ if (Shift) {
+ const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+ Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+ }
+ return Val;
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
+ int64_t VBlocks, SBlocks;
+ VGPRBlocks->evaluateAsAbsolute(VBlocks);
+ SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+ uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+ S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
+ getComputePGMRSrc1Reg(*this, ST);
return Reg;
}
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+ const GCNSubtarget &ST) const {
+ if (AMDGPU::isCompute(CC)) {
+ return getComputePGMRSrc1(ST);
+ }
+ int64_t VBlocks, SBlocks;
+ VGPRBlocks->evaluateAsAbsolute(VBlocks);
+ SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+ return getPGMRSrc1Reg(*this, CC, ST) |
+ S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+ S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+ int64_t ScratchEn;
+ ScratchEnable->evaluateAsAbsolute(ScratchEn);
+ return ScratchEn | getComputePGMRSrc2Reg(*this);
+}
+
uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
if (AMDGPU::isCompute(CC))
return getComputePGMRSrc2();
return 0;
}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
+ MCContext &Ctx) const {
+ uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ const MCExpr *Res = MCBinaryExpr::createOr(
+ MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+ MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+ return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+ const GCNSubtarget &ST,
+ MCContext &Ctx) const {
+ if (AMDGPU::isCompute(CC)) {
+ return getComputePGMRSrc1(ST, Ctx);
+ }
+
+ uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ const MCExpr *Res = MCBinaryExpr::createOr(
+ MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+ MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+ return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
+ uint64_t Reg = getComputePGMRSrc2Reg(*this);
+ const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+ return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
+ MCContext &Ctx) const {
+ if (AMDGPU::isCompute(CC))
+ return getComputePGMRSrc2(Ctx);
+
+ return MCConstantExpr::create(0, Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 8c26789f936cf..c0a353033c3c5 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -22,12 +22,15 @@
namespace llvm {
class GCNSubtarget;
+class MCContext;
+class MCExpr;
+class MachineFunction;
/// Track resource usage for kernels / entry functions.
struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
- uint32_t VGPRBlocks = 0;
- uint32_t SGPRBlocks = 0;
+ const MCExpr *VGPRBlocks = nullptr;
+ const MCExpr *SGPRBlocks = nullptr;
uint32_t Priority = 0;
uint32_t FloatMode = 0;
uint32_t Priv = 0;
@@ -37,14 +40,14 @@ struct SIProgramInfo {
uint32_t WgpMode = 0; // GFX10+
uint32_t MemOrdered = 0; // GFX10+
uint32_t RrWgMode = 0; // GFX12+
- uint64_t ScratchSize = 0;
+ const MCExpr *ScratchSize = nullptr;
// State used to calculate fields set in PGM_RSRC2 pm4 packet.
uint32_t LDSBlocks = 0;
- uint32_t ScratchBlocks = 0;
+ const MCExpr *ScratchBlocks = nullptr;
// Fields set in PGM_RSRC2 pm4 packet
- uint32_t ScratchEnable = 0;
+ const MCExpr *ScratchEnable = nullptr;
uint32_t UserSGPR = 0;
uint32_t TrapHandlerEnable = 0;
uint32_t TGIdXEnable = 0;
@@ -56,44 +59,56 @@ struct SIProgramInfo {
uint32_t LdsSize = 0;
uint32_t EXCPEnable = 0;
- uint64_t ComputePGMRSrc3GFX90A = 0;
+ const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
- uint32_t NumVGPR = 0;
- uint32_t NumArchVGPR = 0;
- uint32_t NumAccVGPR = 0;
- uint32_t AccumOffset = 0;
+ const MCExpr *NumVGPR = nullptr;
+ const MCExpr *NumArchVGPR = nullptr;
+ const MCExpr *NumAccVGPR = nullptr;
+ const MCExpr *AccumOffset = nullptr;
uint32_t TgSplit = 0;
- uint32_t NumSGPR = 0;
+ const MCExpr *NumSGPR = nullptr;
unsigned SGPRSpill = 0;
unsigned VGPRSpill = 0;
uint32_t LDSSize = 0;
- bool FlatUsed = false;
+ const MCExpr *FlatUsed = nullptr;
// Number of SGPRs that meets number of waves per execution unit request.
- uint32_t NumSGPRsForWavesPerEU = 0;
+ const MCExpr *NumSGPRsForWavesPerEU = nullptr;
// Number of VGPRs that meets number of waves per execution unit request.
- uint32_t NumVGPRsForWavesPerEU = 0;
+ const MCExpr *NumVGPRsForWavesPerEU = nullptr;
// Final occupancy.
- uint32_t Occupancy = 0;
+ const MCExpr *Occupancy = nullptr;
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
- bool DynamicCallStack = false;
+ const MCExpr *DynamicCallStack = nullptr;
// Bonus information for debugging.
- bool VCCUsed = false;
+ const MCExpr *VCCUsed = nullptr;
SIProgramInfo() = default;
+ // The constructor sets the values for each member as shown in the struct.
+ // However, setting the MCExpr members to their zero value equivalent
+ // happens in reset together with (duplicated) value re-set for the
+ // non-MCExpr members.
+ void reset(const MachineFunction &MF);
+
/// Compute the value of the ComputePGMRsrc1 register.
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
+ const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
+ MCContext &Ctx) const;
+ const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
+ MCContext &Ctx) const;
/// Compute the value of the ComputePGMRsrc2 register.
uint64_t getComputePGMRSrc2() const;
uint64_t getPGMRSrc2(CallingConv::ID CC) const;
+ const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
+ const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2fae7a31d70bf..2beaf903542bd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1129,12 +1129,45 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs) {
- unsigned MaxWaves = getMaxWavesPerEU(STI);
- unsigned Granule = getVGPRAllocGranule(STI);
+ return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+ getMaxWavesPerEU(STI),
+ getTotalNumVGPRs(STI));
+}
+
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+ unsigned MaxWaves,
+ unsigned TotalNumVGPRs) {
if (NumVGPRs < Granule)
return MaxWaves;
unsigned RoundedRegs = alignTo(NumVGPRs, Granule);
- return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves);
+ return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
+}
+
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+ AMDGPUSubtarget::Generation Gen) {
+ if (Gen >= AMDGPUSubtarget::GFX10)
+ return MaxWaves;
+
+ if (Gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ if (SGPRs <= 80)
+ return 10;
+ if (SGPRs <= 88)
+ return 9;
+ if (SGPRs <= 100)
+ return 8;
+ return 7;
+ }
+ if (SGPRs <= 48)
+ return 10;
+ if (SGPRs <= 56)
+ return 9;
+ if (SGPRs <= 64)
+ return 8;
+ if (SGPRs <= 72)
+ return 7;
+ if (SGPRs <= 80)
+ return 6;
+ return 5;
}
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 12d1b3a55cccb..fc4147df76e3e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
+#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/InstrTypes.h"
@@ -311,6 +312,17 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
+/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
+/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+ unsigned MaxWaves,
+ unsigned TotalNumVGPRs);
+
+/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p
+/// Gen.
+unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
+ AMDGPUSubtarget::Generation Gen);
+
/// \returns Number of VGPR blocks needed for given subtarget \p STI when
/// \p NumVGPRs are used. We actually return the number of blocks -1, since
/// that's what we encode.
diff --git a/llvm/test/MC/AMDGPU/alignto_mcexpr.s b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
new file mode 100644
index 0000000000000..e864f3736828c
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set alignto_zero_eight, 0
+// ASM: .set alignto_one_eight, 8
+// ASM: .set alignto_five_eight, 8
+// ASM: .set alignto_seven_eight, 8
+// ASM: .set alignto_eight_eight, 8
+// ASM: .set alignto_ten_eight, 16
+
+.set alignto_zero_eight, alignto(0, 8)
+.set alignto_one_eight, alignto(1, 8)
+.set alignto_five_eight, alignto(5, 8)
+.set alignto_seven_eight, alignto(7, 8)
+.set alignto_eight_eight, alignto(8, 8)
+.set alignto_ten_eight, alignto(10, 8)
diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
new file mode 100644
index 0000000000000..e88b23bb34d4f
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
@@ -0,0 +1,31 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=bonaire < %s | FileCheck --check-prefix=GFX7 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck --check-prefix=GFX940 %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+
+// gfx940 has architected flat scratch enabled.
+
+// GFX7: .set extrasgpr_none, 0
+// GFX7: .set extrasgpr_vcc, 2
+// GFX7: .set extrasgpr_flatscr, 4
+// GFX7: .set extrasgpr_xnack, 0
+
+// GFX90A: .set extrasgpr_none, 0
+// GFX90A: .set extrasgpr_vcc, 2
+// GFX90A: .set extrasgpr_flatscr, 6
+// GFX90A: .set extrasgpr_xnack, 4
+
+// GFX940: .set extrasgpr_none, 6
+// GFX940: .set extrasgpr_vcc, 6
+// GFX940: .set extrasgpr_flatscr, 6
+// GFX940: .set extrasgpr_xnack, 6
+
+// GFX10: .set extrasgpr_none, 0
+// GFX10: .set extrasgpr_vcc, 2
+// GFX10: .set extrasgpr_flatscr, 0
+// GFX10: .set extrasgpr_xnack, 0
+
+.set extrasgpr_none, extrasgprs(0, 0, 0)
+.set extrasgpr_vcc, extrasgprs(1, 0, 0)
+.set extrasgpr_flatscr, extrasgprs(0, 1, 0)
+.set extrasgpr_xnack, extrasgprs(0, 0, 1)
diff --git a/llvm/test/MC/AMDGPU/occupancy_mcexpr.s b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
new file mode 100644
index 0000000000000..06bec8c538dae
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set occupancy_init_one, 1
+// ASM: .set occupancy_init_seven, 7
+// ASM: .set occupancy_init_eight, 8
+
+.set occupancy_init_one, occupancy(0, 0, 0, 0, 1, 0, 0)
+.set occupancy_init_seven, occupancy(0, 0, 0, 0, 7, 0, 0)
+.set occupancy_init_eight, occupancy(0, 0, 0, 0, 8, 0, 0)
+
+// ASM: .set occupancy_numsgpr_seaisle_ten, 10
+// ASM: .set occupancy_numsgpr_seaisle_nine, 9
+// ASM: .set occupancy_numsgpr_seaisle_eight, 8
+// ASM: .set occupancy_numsgpr_seaisle_seven, 7
+// ASM: .set occupancy_numsgpr_seaisle_six, 6
+// ASM: .set occupancy_numsgpr_seaisle_five, 5
+
+.set occupancy_numsgpr_seaisle_ten, occupancy(0, 0, 0, 6, 11, 1, 0)
+.set occupancy_numsgpr_seaisle_nine, occupancy(0, 0, 0, 6, 11, 49, 0)
+.set occupancy_numsgpr_seaisle_eight, occupancy(0, 0, 0, 6, 11, 57, 0)
+.set occupancy_numsgpr_seaisle_seven, occupancy(0, 0, 0, 6, 11, 65, 0)
+.set occupancy_numsgpr_seaisle_six, occupancy(0, 0, 0, 6, 11, 73, 0)
+.set occupancy_numsgpr_seaisle_five, occupancy(0, 0, 0, 6, 11, 81, 0)
+
+// ASM: .set occupancy_numsgpr_gfx9_ten, 10
+// ASM: .set occupancy_numsgpr_gfx9_nine, 9
+// ASM: .set occupancy_numsgpr_gfx9_eight, 8
+// ASM: .set occupancy_numsgpr_gfx9_seven, 7
+
+.set occupancy_numsgpr_gfx9_ten, occupancy(0, 0, 0, 8, 11, 1, 0)
+.set occupancy_numsgpr_gfx9_nine, occupancy(0, 0, 0, 8, 11, 81, 0)
+.set occupancy_numsgpr_gfx9_eight, occupancy(0, 0, 0, 8, 11, 89, 0)
+.set occupancy_numsgpr_gfx9_seven, occupancy(0, 0, 0, 8, 11, 101, 0)
+
+// ASM: .set occupancy_numsgpr_gfx10_one, 1
+// ASM: .set occupancy_numsgpr_gfx10_seven, 7
+// ASM: .set occupancy_numsgpr_gfx10_eight, 8
+
+.set occupancy_numsgpr_gfx10_one, occupancy(1, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_seven, occupancy(7, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_eight, occupancy(8, 0, 0, 9, 11, 1, 0)
+
+// ASM: .set occupancy_numvgpr_high_granule_one, 1
+// ASM: .set occupancy_numvgpr_high_granule_seven, 7
+// ASM: .set occupancy_numvgpr_high_granule_eight, 8
+
+.set occupancy_numvgpr_high_granule_one, occupancy(1, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_seven, occupancy(7, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_eight, occupancy(8, 2, 0, 0, 11, 0, 1)
+
+// ASM: .set occupancy_numvgpr_low_total_one, 1
+// ASM: .set occupancy_numvgpr_one, 1
+// ASM: .set occupancy_numvgpr_seven, 7
+// ASM: .set occupancy_numvgpr_eight, 8
+// ASM: .set occupancy_numvgpr_ten, 10
+
+.set occupancy_numvgpr_low_total_one, occupancy(11, 4, 2, 0, 11, 0, 4)
+.set occupancy_numvgpr_one, occupancy(11, 4, 4, 0, 11, 0, 4)
+.set occupancy_numvgpr_seven, occupancy(11, 4, 28, 0, 11, 0, 4)
+.set occupancy_numvgpr_eight, occupancy(11, 4, 32, 0, 11, 0, 4)
+.set occupancy_numvgpr_ten, occupancy(11, 4, 40, 0, 11, 0, 4)
diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
new file mode 100644
index 0000000000000..29bb885b20804
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck --check-prefix=GFX90A %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX10 %s
+
+// GFX10: .set totalvgpr_none, 0
+// GFX10: .set totalvgpr_one, 1
+// GFX10: .set totalvgpr_two, 2
+
+.set totalvgpr_none, totalnumvgprs(0, 0)
+.set totalvgpr_one, totalnumvgprs(1, 0)
+.set totalvgpr_two, totalnumvgprs(1, 2)
+
+// GFX90A: .set totalvgpr90a_none, 0
+// GFX90A: .set totalvgpr90a_one, 1
+// GFX90A: .set totalvgpr90a_two, 2
+
+.set totalvgpr90a_none, totalnumvgprs(0, 0)
+.set totalvgpr90a_one, totalnumvgprs(0, 1)
+.set totalvgpr90a_two, totalnumvgprs(0, 2)
+
+// GFX90A: .set totalvgpr90a_agpr_minimal, 1
+// GFX90A: .set totalvgpr90a_agpr_rounded_eight, 8
+// GFX90A: .set totalvgpr90a_agpr_exact_eight, 8
+
+.set totalvgpr90a_agpr_minimal, totalnumvgprs(1, 0)
+.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs(4, 2)
+.set totalvgpr90a_agpr_exact_eight, totalnumvgprs(4, 4)
diff --git a/llvm/unittests/MC/AMDGPU/CMakeLists.txt b/llvm/unittests/MC/AMDGPU/CMakeLists.txt
index 06ca89a72a7cd..be8ff572e6f7d 100644
--- a/llvm/unittests/MC/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/MC/AMDGPU/CMakeLists.txt
@@ -1,12 +1,20 @@
+include_directories(
+ ${PROJECT_SOURCE_DIR}/lib/Target/AMDGPU
+ ${PROJECT_BINARY_DIR}/lib/Target/AMDGPU
+ )
+
set(LLVM_LINK_COMPONENTS
AMDGPUCodeGen
AMDGPUDesc
AMDGPUInfo
+ CodeGen
+ Core
MC
Support
TargetParser
)
-add_llvm_unittest(AMDGPUDwarfTests
+add_llvm_unittest(AMDGPUMCTests
DwarfRegMappings.cpp
+ SIProgramInfoMCExprs.cpp
)
diff --git a/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp
new file mode 100644
index 0000000000000..f2161f71e6e99
--- /dev/null
+++ b/llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp
@@ -0,0 +1,81 @@
+//===- llvm/unittests/MC/AMDGPU/SIProgramInfoMCExprs.cpp ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "SIProgramInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class SIProgramInfoMCExprsTest : public testing::Test {
+protected:
+ std::unique_ptr<GCNTargetMachine> TM;
+ std::unique_ptr<LLVMContext> Ctx;
+ std::unique_ptr<GCNSubtarget> ST;
+ std::unique_ptr<MachineModuleInfo> MMI;
+ std::unique_ptr<MachineFunction> MF;
+ std::unique_ptr<Module> M;
+
+ SIProgramInfo PI;
+
+ static void SetUpTestSuite() {
+ LLVMInitializeAMDGPUTargetInfo();
+ LLVMInitializeAMDGPUTarget();
+ LLVMInitializeAMDGPUTargetMC();
+ }
+
+ SIProgramInfoMCExprsTest() {
+ std::string Triple = "amdgcn-amd-amdhsa";
+ std::string CPU = "gfx1010";
+ std::string FS = "";
+
+ std::string Error;
+ const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
+ TargetOptions Options;
+
+ TM.reset(static_cast<GCNTargetMachine *>(TheTarget->createTargetMachine(
+ Triple, CPU, FS, Options, std::nullopt, std::nullopt)));
+
+ Ctx = std::make_unique<LLVMContext>();
+ M = std::make_unique<Module>("Module", *Ctx);
+ M->setDataLayout(TM->createDataLayout());
+ auto *FType = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ auto *F = Function::Create(FType, GlobalValue::ExternalLinkage, "Test", *M);
+ MMI = std::make_unique<MachineModuleInfo>(TM.get());
+
+ ST = std::make_unique<GCNSubtarget>(TM->getTargetTriple(),
+ TM->getTargetCPU(),
+ TM->getTargetFeatureString(), *TM);
+
+ MF = std::make_unique<MachineFunction>(*F, *TM, *ST, 1, *MMI);
+ PI.reset(*MF.get());
+ }
+};
+
+TEST_F(SIProgramInfoMCExprsTest, TestDeathHSAKernelEmit) {
+ MCContext &Ctx = MF->getContext();
+ MCSymbol *Sym = Ctx.getOrCreateSymbol("Unknown");
+ PI.ScratchSize = MCSymbolRefExpr::create(Sym, Ctx);
+
+ auto &Func = MF->getFunction();
+ Func.setCallingConv(CallingConv::AMDGPU_KERNEL);
+ AMDGPU::HSAMD::MetadataStreamerMsgPackV4 MD;
+ EXPECT_DEATH(MD.emitKernel(*MF, PI),
+ "could not resolve expression when required.");
+}
More information about the llvm-commits
mailing list