[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)
Janek van Oirschot via cfe-commits
cfe-commits@lists.llvm.org
Tue Sep 24 13:41:33 PDT 2024
https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/102913
>From 136ad3a97df7b02d89d845920c53ef80da1e7c31 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot@amd.com>
Date: Mon, 12 Aug 2024 14:58:41 +0100
Subject: [PATCH 01/11] Convert AMDGPUResourceUsageAnalysis pass from Module to
MachineFunction pass and move metadata propagation logic to MC layer
---
.../amdgcn-machine-analysis-remarks.cl | 10 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 208 +++++--
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 19 +-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 220 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 94 +++
.../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 147 +----
.../AMDGPU/AMDGPUResourceUsageAnalysis.h | 40 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 41 ++
.../MCTargetDesc/AMDGPUTargetStreamer.h | 23 +
.../Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 10 +-
.../AMDGPU/GlobalISel/extractelement.ll | 168 +++---
.../AMDGPU/GlobalISel/flat-scratch-init.ll | 16 +-
.../GlobalISel/llvm.amdgcn.workitem.id.ll | 6 +-
.../AMDGPU/GlobalISel/non-entry-alloca.ll | 8 +-
.../CodeGen/AMDGPU/agpr-register-count.ll | 65 ++-
.../CodeGen/AMDGPU/amdgpu.private-memory.ll | 3 +-
.../amdpal-metadata-agpr-register-count.ll | 4 +-
...-amdgpu-flat-work-group-size-vgpr-limit.ll | 51 +-
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 30 +-
.../AMDGPU/call-alias-register-usage-agpr.ll | 16 +-
.../AMDGPU/call-alias-register-usage0.ll | 8 +-
.../AMDGPU/call-alias-register-usage1.ll | 11 +-
.../AMDGPU/call-alias-register-usage2.ll | 11 +-
.../AMDGPU/call-alias-register-usage3.ll | 11 +-
.../AMDGPU/call-graph-register-usage.ll | 50 +-
.../callee-special-input-sgprs-fixed-abi.ll | 4 +-
.../test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll | 1 +
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 34 +-
.../AMDGPU/codegen-internal-only-func.ll | 6 +-
.../AMDGPU/control-flow-fastregalloc.ll | 6 +-
llvm/test/CodeGen/AMDGPU/elf.ll | 7 +-
.../enable-scratch-only-dynamic-stack.ll | 14 +-
.../CodeGen/AMDGPU/function-resource-usage.ll | 533 ++++++++++++++++++
.../AMDGPU/gfx11-user-sgpr-init16-bug.ll | 16 +-
.../AMDGPU/inline-asm-reserved-regs.ll | 2 +-
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 24 +-
llvm/test/CodeGen/AMDGPU/ipra.ll | 14 +-
llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 4 +-
.../CodeGen/AMDGPU/kernel_code_t_recurse.ll | 7 +-
.../CodeGen/AMDGPU/large-alloca-compute.ll | 31 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 45 +-
.../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 6 +-
.../AMDGPU/lower-module-lds-offsets.ll | 26 +-
llvm/test/CodeGen/AMDGPU/mesa3d.ll | 13 +-
.../AMDGPU/module-lds-false-sharing.ll | 99 ++--
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 20 +-
llvm/test/CodeGen/AMDGPU/recursion.ll | 28 +-
.../AMDGPU/resource-optimization-remarks.ll | 64 +--
.../AMDGPU/resource-usage-dead-function.ll | 13 +-
.../CodeGen/AMDGPU/stack-realign-kernel.ll | 126 +++--
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 3 +-
llvm/test/CodeGen/AMDGPU/trap.ll | 33 +-
55 files changed, 1801 insertions(+), 655 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
create mode 100644 llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b9127..a2dd59a871904c 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,12 +2,12 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
// expected-remark@+10 {{Function Name: foo}}
-// expected-remark@+9 {{ SGPRs: 13}}
-// expected-remark@+8 {{ VGPRs: 10}}
-// expected-remark@+7 {{ AGPRs: 12}}
-// expected-remark@+6 {{ ScratchSize [bytes/lane]: 0}}
+// expected-remark@+9 {{ SGPRs: foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1))}}
+// expected-remark@+8 {{ VGPRs: foo.num_vgpr}}
+// expected-remark@+7 {{ AGPRs: foo.num_agpr}}
+// expected-remark@+6 {{ ScratchSize [bytes/lane]: foo.private_seg_size}}
// expected-remark@+5 {{ Dynamic Stack: False}}
-// expected-remark@+4 {{ Occupancy [waves/SIMD]: 10}}
+// expected-remark@+4 {{ Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(foo.num_agpr, foo.num_vgpr), 1, 0))}}
// expected-remark@+3 {{ SGPRs Spill: 0}}
// expected-remark@+2 {{ VGPRs Spill: 0}}
// expected-remark@+1 {{ LDS Size [bytes/block]: 0}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b90d245b7bd394..23bc804515e690 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -92,6 +93,9 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
+ RI = std::make_unique<MCResourceInfo>(OutContext);
+ OccupancyValidateMap =
+ std::make_unique<DenseMap<const Function *, const MCExpr *>>();
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -359,6 +363,102 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
+void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
+ if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+ return;
+
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+
+ auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val)) {
+ Res = Val;
+ return true;
+ }
+ return false;
+ };
+
+ const uint64_t MaxScratchPerWorkitem =
+ STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
+ MCSymbol *ScratchSizeSymbol =
+ RI->getSymbol(F.getName(), RIK::RIK_PrivateSegSize);
+ uint64_t ScratchSize;
+ if (ScratchSizeSymbol->isVariable() &&
+ TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
+ ScratchSize > MaxScratchPerWorkitem) {
+ DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
+ DS_Error);
+ F.getContext().diagnose(DiagStackSize);
+ }
+
+ // Validate addressable scalar registers (i.e., prior to added implicit
+ // SGPRs).
+ MCSymbol *NumSGPRSymbol = RI->getSymbol(F.getName(), RIK::RIK_NumSGPR);
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ !STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ uint64_t NumSgpr;
+ if (NumSGPRSymbol->isVariable() &&
+ TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
+ DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
+ NumSgpr, MaxAddressableNumSGPRs,
+ DS_Error, DK_ResourceLimit);
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+
+ MCSymbol *VCCUsedSymbol = RI->getSymbol(F.getName(), RIK::RIK_UsesVCC);
+ MCSymbol *FlatUsedSymbol =
+ RI->getSymbol(F.getName(), RIK::RIK_UsesFlatScratch);
+ uint64_t VCCUsed, FlatUsed, NumSgpr;
+
+ if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
+ FlatUsedSymbol->isVariable() &&
+ TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+ TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
+ TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
+
+ // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
+ // resolvable.
+ NumSgpr += IsaInfo::getNumExtraSGPRs(
+ &STM, VCCUsed, FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+ if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+ STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ if (NumSgpr > MaxAddressableNumSGPRs) {
+ DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
+ MaxAddressableNumSGPRs, DS_Error,
+ DK_ResourceLimit);
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+
+ auto I = OccupancyValidateMap->find(&F);
+ if (I != OccupancyValidateMap->end()) {
+ const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
+ F, "amdgpu-waves-per-eu", {0, 0}, true);
+ uint64_t Occupancy;
+ const MCExpr *OccupancyExpr = I->getSecond();
+
+ if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
+ DiagnosticInfoOptimizationFailure Diag(
+ F, F.getSubprogram(),
+ "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
+ "'" +
+ F.getName() + "': desired occupancy was " + Twine(MinWEU) +
+ ", final occupancy is " + Twine(Occupancy));
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+ }
+}
+
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
@@ -371,25 +471,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
getTargetStreamer()->EmitCodeEnd(STI);
}
- return AsmPrinter::doFinalization(M);
-}
+ // Assign expressions which can only be resolved when all other functions are
+ // known.
+ RI->Finalize();
+ getTargetStreamer()->EmitMCResourceMaximums(
+ RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
-// Print comments that apply to both callable functions and entry points.
-void AMDGPUAsmPrinter::emitCommonFunctionComments(
- uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
- uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
- const AMDGPUMachineFunction *MFI) {
- OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
- OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
- OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
- if (NumAGPR) {
- OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
- OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
- false);
+ for (Function &F : M.functions()) {
+ ValidateMCResourceInfo(F);
}
- OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
- OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
- false);
+ return AsmPrinter::doFinalization(M);
}
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
@@ -402,6 +493,7 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
return Str;
}
+// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
@@ -571,21 +663,45 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
STM.hasMAIInsts());
+ {
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo();
+ RI->gatherResourceInfo(MF, Info);
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ getTargetStreamer()->EmitMCResourceInfo(
+ RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_NumSGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize),
+ RI->getSymbol(MF.getName(), RIK::RIK_UsesVCC),
+ RI->getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasRecursion),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasIndirectCall));
+ }
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(CommentSection);
if (!MFI->isEntryFunction()) {
+ using RIK = MCResourceInfo::ResourceInfoKind;
OutStreamer->emitRawComment(" Function info:", false);
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
- ResourceUsage->getResourceInfo(&MF.getFunction());
+
emitCommonFunctionComments(
- Info.NumVGPR,
- STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
- Info.getTotalNumVGPRs(STM),
- Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
- Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
+ RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR)->getVariableValue(),
+ STM.hasMAIInsts() ? RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR)
+ ->getVariableValue()
+ : nullptr,
+ RI->createTotalNumVGPRs(MF, Ctx),
+ RI->createTotalNumSGPRs(
+ MF,
+ MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
+ Ctx),
+ RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize)
+ ->getVariableValue(),
+ getFunctionCodeSize(MF), MFI);
return false;
}
@@ -753,8 +869,6 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
- ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
@@ -771,18 +885,38 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
return false;
};
- ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
- ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
- ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
- ProgInfo.AccumOffset =
- CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
+ auto GetSymRefExpr =
+ [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
+ MCSymbol *Sym = RI->getSymbol(MF.getName(), RIK);
+ return MCSymbolRefExpr::create(Sym, Ctx);
+ };
+
+ const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+ const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
+ ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
+ ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
+ ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+
+ // AccumOffset computed for the MCExpr equivalent of:
+ // alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ ProgInfo.AccumOffset = MCBinaryExpr::createSub(
+ MCBinaryExpr::createDiv(
+ AMDGPUMCExpr::createAlignTo(
+ AMDGPUMCExpr::createMax({ConstOne, ProgInfo.NumArchVGPR}, Ctx),
+ ConstFour, Ctx),
+ ConstFour, Ctx),
+ ConstOne, Ctx);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
- ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
- ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
- ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
- ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+ ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
+ ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
+ ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
+ ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
ProgInfo.DynamicCallStack =
- CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
+ MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
+ GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
@@ -1082,6 +1216,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+ OccupancyValidateMap->insert({&MF.getFunction(), ProgInfo.Occupancy});
+
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
uint64_t Occupancy;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f66bbde42ce278..676a4687ee2af7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -24,6 +24,7 @@ struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
+class MCResourceInfo;
namespace AMDGPU {
struct MCKernelDescriptor;
@@ -40,12 +41,20 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
AMDGPUResourceUsageAnalysis *ResourceUsage;
+ std::unique_ptr<MCResourceInfo> RI;
+
SIProgramInfo CurrentProgramInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
+ // ValidateMCResourceInfo cannot recompute parts of the occupancy as it does
+ // for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
+ // really want to track and validate the occupancy.
+ std::unique_ptr<DenseMap<const Function *, const MCExpr *>>
+ OccupancyValidateMap;
+
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
@@ -60,11 +69,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void emitPALFunctionMetadata(const MachineFunction &MF);
- void emitCommonFunctionComments(uint32_t NumVGPR,
- std::optional<uint32_t> NumAGPR,
- uint32_t TotalNumVGPR, uint32_t NumSGPR,
- uint64_t ScratchSize, uint64_t CodeSize,
- const AMDGPUMachineFunction *MFI);
void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR,
@@ -84,6 +88,11 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
SmallString<128> getMCExprStr(const MCExpr *Value);
+ /// Attempts to replace the validation that is missed in getSIProgramInfo due
+ /// to MCExpr being unknown. Invoked during doFinalization such that the
+ /// MCResourceInfo symbols are known.
+ void ValidateMCResourceInfo(Function &F);
+
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
new file mode 100644
index 00000000000000..58383475b312c9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -0,0 +1,220 @@
+//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCResourceInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
+ switch (RIK) {
+ case RIK_NumVGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_vgpr"));
+ case RIK_NumAGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_agpr"));
+ case RIK_NumSGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_sgpr"));
+ case RIK_PrivateSegSize:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".private_seg_size"));
+ case RIK_UsesVCC:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_vcc"));
+ case RIK_UsesFlatScratch:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_flat_scratch"));
+ case RIK_HasDynSizedStack:
+ return OutContext.getOrCreateSymbol(FuncName +
+ Twine(".has_dyn_sized_stack"));
+ case RIK_HasRecursion:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".has_recursion"));
+ case RIK_HasIndirectCall:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".has_indirect_call"));
+ }
+ llvm_unreachable("Unexpected ResourceInfoKind.");
+}
+
+const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
+ ResourceInfoKind RIK,
+ MCContext &Ctx) {
+ return MCSymbolRefExpr::create(getSymbol(FuncName, RIK), Ctx);
+}
+
+void MCResourceInfo::assignMaxRegs() {
+ // Assign expression to get the max register use to the max_num_Xgpr symbol.
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
+ MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
+ MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
+
+ auto assignMaxRegSym = [this](MCSymbol *Sym, int32_t RegCount) {
+ const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
+ Sym->setVariableValue(MaxExpr);
+ };
+
+ assignMaxRegSym(MaxVGPRSym, MaxVGPR);
+ assignMaxRegSym(MaxAGPRSym, MaxAGPR);
+ assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+}
+
+void MCResourceInfo::Finalize() {
+ assert(!finalized && "Cannot finalize ResourceInfo again.");
+ finalized = true;
+ assignMaxRegs();
+}
+
+MCSymbol *MCResourceInfo::getMaxVGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_vgpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxAGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_agpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxSGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_sgpr");
+}
+
+void MCResourceInfo::assignResourceInfoExpr(
+ int64_t localValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
+ const MachineFunction &MF,
+ const SmallVectorImpl<const Function *> &Callees) {
+ const MCConstantExpr *localConstExpr =
+ MCConstantExpr::create(localValue, OutContext);
+ const MCExpr *SymVal = localConstExpr;
+ if (Callees.size() > 0) {
+ std::vector<const MCExpr *> ArgExprs;
+ // Avoid recursive symbol assignment.
+ SmallSet<StringRef, 8> Seen;
+ ArgExprs.push_back(localConstExpr);
+ Seen.insert(MF.getName());
+
+ for (const Function *Callee : Callees) {
+ if (Seen.contains(Callee->getName()))
+ continue;
+ Seen.insert(Callee->getName());
+ MCSymbol *calleeValSym = getSymbol(Callee->getName(), RIK);
+ ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+ }
+ SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
+ }
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ Sym->setVariableValue(SymVal);
+}
+
+void MCResourceInfo::gatherResourceInfo(
+ const MachineFunction &MF,
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI) {
+ // Worst case VGPR use for non-hardware-entrypoints.
+ MCSymbol *maxVGPRSym = getMaxVGPRSymbol();
+ MCSymbol *maxAGPRSym = getMaxAGPRSymbol();
+ MCSymbol *maxSGPRSym = getMaxSGPRSymbol();
+
+ if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
+ addMaxVGPRCandidate(FRI.NumVGPR);
+ addMaxAGPRCandidate(FRI.NumAGPR);
+ addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+ }
+
+ auto setMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
+ ResourceInfoKind RIK) {
+ if (!FRI.HasIndirectCall) {
+ assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
+ FRI.Callees);
+ } else {
+ const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
+ MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK);
+ const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
+ {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
+ LocalNumSym->setVariableValue(MaxWithLocal);
+ }
+ };
+
+ setMaxReg(maxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
+ setMaxReg(maxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
+ setMaxReg(maxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+
+ {
+ // The expression for private segment size should be: FRI.PrivateSegmentSize
+ // + max(FRI.Callees, FRI.CalleeSegmentSize)
+ std::vector<const MCExpr *> ArgExprs;
+ if (FRI.CalleeSegmentSize)
+ ArgExprs.push_back(
+ MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
+
+ if (!FRI.HasIndirectCall) {
+ for (const Function *Callee : FRI.Callees) {
+ MCSymbol *calleeValSym =
+ getSymbol(Callee->getName(), RIK_PrivateSegSize);
+ ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+ }
+ }
+ const MCExpr *localConstExpr =
+ MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
+ if (ArgExprs.size() > 0) {
+ const AMDGPUMCExpr *transitiveExpr =
+ AMDGPUMCExpr::createMax(ArgExprs, OutContext);
+ localConstExpr =
+ MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
+ }
+ getSymbol(MF.getName(), RIK_PrivateSegSize)
+ ->setVariableValue(localConstExpr);
+ }
+
+ auto setToLocal = [&](int64_t localValue, ResourceInfoKind RIK) {
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ Sym->setVariableValue(MCConstantExpr::create(localValue, OutContext));
+ };
+
+ if (!FRI.HasIndirectCall) {
+ assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.UsesFlatScratch,
+ ResourceInfoKind::RIK_UsesFlatScratch,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasIndirectCall,
+ ResourceInfoKind::RIK_HasIndirectCall,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ } else {
+ setToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+ setToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+ setToLocal(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack);
+ setToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+ setToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+ }
+}
+
+const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF,
+ MCContext &Ctx) {
+ return AMDGPUMCExpr::createTotalNumVGPR(
+ getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx),
+ getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx);
+}
+
+const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF,
+ bool hasXnack,
+ MCContext &Ctx) {
+ return MCBinaryExpr::createAdd(
+ getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx),
+ AMDGPUMCExpr::createExtraSGPRs(
+ getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx),
+ getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx),
+ Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
new file mode 100644
index 00000000000000..6646003693a67f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -0,0 +1,94 @@
+//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
+
+namespace llvm {
+
+class MCContext;
+class MCSymbol;
+class StringRef;
+class MachineFunction;
+
+class MCResourceInfo {
+public:
+ enum ResourceInfoKind {
+ RIK_NumVGPR,
+ RIK_NumAGPR,
+ RIK_NumSGPR,
+ RIK_PrivateSegSize,
+ RIK_UsesVCC,
+ RIK_UsesFlatScratch,
+ RIK_HasDynSizedStack,
+ RIK_HasRecursion,
+ RIK_HasIndirectCall
+ };
+
+private:
+ int32_t MaxVGPR;
+ int32_t MaxAGPR;
+ int32_t MaxSGPR;
+
+ MCContext &OutContext;
+ bool finalized;
+
+ void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
+ AMDGPUMCExpr::VariantKind Kind,
+ const MachineFunction &MF,
+ const SmallVectorImpl<const Function *> &Callees);
+
+ // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
+ void assignMaxRegs();
+
+public:
+ MCResourceInfo(MCContext &OutContext)
+ : MaxVGPR(0), MaxAGPR(0), MaxSGPR(0), OutContext(OutContext),
+ finalized(false) {}
+ void addMaxVGPRCandidate(int32_t candidate) {
+ MaxVGPR = std::max(MaxVGPR, candidate);
+ }
+ void addMaxAGPRCandidate(int32_t candidate) {
+ MaxAGPR = std::max(MaxAGPR, candidate);
+ }
+ void addMaxSGPRCandidate(int32_t candidate) {
+ MaxSGPR = std::max(MaxSGPR, candidate);
+ }
+
+ MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK);
+ const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
+ MCContext &Ctx);
+
+ // Resolves the final symbols that requires the inter-function resource info
+ // to be resolved.
+ void Finalize();
+
+ MCSymbol *getMaxVGPRSymbol();
+ MCSymbol *getMaxAGPRSymbol();
+ MCSymbol *getMaxSGPRSymbol();
+
+ /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
+ /// granularity. However, some resource info has to be assigned the call
+ /// transitive maximum or accumulative. For example, if A calls B and B's VGPR
+ /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
+ /// functions with indirect calls should be assigned the module level maximum.
+ void gatherResourceInfo(
+ const MachineFunction &MF,
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI);
+
+ const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
+ const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
+ MCContext &Ctx);
+};
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0aca99a82d1978..1ee3c40d69a3b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -13,14 +13,6 @@
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
-/// The analysis takes callees into account. E.g. if a function A that needs 10
-/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
-/// will return 20.
-/// It is assumed that an indirect call can go into any function except
-/// hardware-entrypoints. Therefore the register usage of functions with
-/// indirect calls is estimated as the maximum of all non-entrypoint functions
-/// in the module.
-///
//===----------------------------------------------------------------------===//
#include "AMDGPUResourceUsageAnalysis.h"
@@ -28,8 +20,8 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
- const GCNSubtarget &ST) const {
- return NumExplicitSGPR +
- IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
- ST.getTargetID().isXnackOnOrAny());
-}
-
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
- const GCNSubtarget &ST) const {
- return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
-}
-
-bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
+bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
- bool HasIndirectCall = false;
-
- CallGraph CG = CallGraph(M);
- auto End = po_end(&CG);
// By default, for code object v5 and later, track only the minimum scratch
// size
uint32_t AssumedStackSizeForDynamicSizeObjects =
clAssumedStackSizeForDynamicSizeObjects;
uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
- if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+ if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
+ AMDGPU::AMDHSA_COV5 ||
STI.getTargetTriple().getOS() == Triple::AMDPAL) {
- if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
+ if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
- if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
+ if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
AssumedStackSizeForExternalCall = 0;
}
- for (auto IT = po_begin(&CG); IT != End; ++IT) {
- Function *F = IT->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- MachineFunction *MF = MMI.getMachineFunction(*F);
- assert(MF && "function must have been generated already");
-
- auto CI =
- CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = CI.first->second;
- assert(CI.second && "should only be called once per function");
- Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
- AssumedStackSizeForExternalCall);
- HasIndirectCall |= Info.HasIndirectCall;
- }
-
- // It's possible we have unreachable functions in the module which weren't
- // visited by the PO traversal. Make sure we have some resource counts to
- // report.
- for (const auto &IT : CG) {
- const Function *F = IT.first;
- if (!F || F->isDeclaration())
- continue;
-
- auto CI =
- CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
- if (!CI.second) // Skip already visited functions
- continue;
-
- SIFunctionResourceInfo &Info = CI.first->second;
- MachineFunction *MF = MMI.getMachineFunction(*F);
- assert(MF && "function must have been generated already");
- Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
- AssumedStackSizeForExternalCall);
- HasIndirectCall |= Info.HasIndirectCall;
- }
-
- if (HasIndirectCall)
- propagateIndirectCallRegisterUsage();
+ ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
+ AssumedStackSizeForExternalCall);
return false;
}
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
- const MachineFunction &MF, const TargetMachine &TM,
- uint32_t AssumedStackSizeForDynamicSizeObjects,
+ const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const {
SIFunctionResourceInfo Info;
@@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
- uint64_t CalleeFrameSize = 0;
+ Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
const Function *Callee = getCalleeFunction(*CalleeOp);
- DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
- CallGraphResourceInfo.end();
// Avoid crashing on undefined behavior with an illegal call to a
// kernel. If a callsite's calling convention doesn't match the
@@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
+ auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
+ return F == &MF.getFunction();
+ };
+
+ if (Callee && !isSameFunction(MF, Callee))
+ Info.Callees.push_back(Callee);
+
bool IsIndirect = !Callee || Callee->isDeclaration();
- if (!IsIndirect)
- I = CallGraphResourceInfo.find(Callee);
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
@@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
// directly call the tail called function. If a kernel directly
// calls a tail recursive function, we'll assume maximum stack size
// based on the regular call instruction.
- CalleeFrameSize = std::max(
- CalleeFrameSize,
+ Info.CalleeSegmentSize = std::max(
+ Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
}
}
- if (IsIndirect || I == CallGraphResourceInfo.end()) {
- CalleeFrameSize =
- std::max(CalleeFrameSize,
+ if (IsIndirect) {
+ Info.CalleeSegmentSize =
+ std::max(Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
// Register usage of indirect calls gets handled later
@@ -555,19 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Info.HasIndirectCall = true;
- } else {
- // We force CodeGen to run in SCC order, so the callee's register
- // usage etc. should be the cumulative usage of all callees.
- MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
- MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
- MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
- CalleeFrameSize =
- std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
- Info.UsesVCC |= I->second.UsesVCC;
- Info.UsesFlatScratch |= I->second.UsesFlatScratch;
- Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
- Info.HasRecursion |= I->second.HasRecursion;
- Info.HasIndirectCall |= I->second.HasIndirectCall;
}
}
}
@@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
- Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
-
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
- // Collect the maximum number of registers from non-hardware-entrypoints.
- // All these functions are potential targets for indirect calls.
- int32_t NonKernelMaxSGPRs = 0;
- int32_t NonKernelMaxVGPRs = 0;
- int32_t NonKernelMaxAGPRs = 0;
-
- for (const auto &I : CallGraphResourceInfo) {
- if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
- auto &Info = I.getSecond();
- NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
- NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
- NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
- }
- }
-
- // Add register usage for functions with indirect calls.
- // For calls to unknown functions, we assume the maximum register usage of
- // all non-hardware-entrypoints in the current module.
- for (auto &I : CallGraphResourceInfo) {
- auto &Info = I.getSecond();
- if (Info.HasIndirectCall) {
- Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
- Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
- Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
- }
- }
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 7f71de6749dcef..92ef41f49b3ba8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -15,8 +15,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
namespace llvm {
@@ -24,10 +24,9 @@ class GCNSubtarget;
class MachineFunction;
class TargetMachine;
-struct AMDGPUResourceUsageAnalysis : public ModulePass {
- static char ID;
-
+struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
public:
+ static char ID;
// Track resource usage for callee functions.
struct SIFunctionResourceInfo {
// Track the number of explicitly used VGPRs. Special registers reserved at
@@ -35,48 +34,33 @@ struct AMDGPUResourceUsageAnalysis : public ModulePass {
int32_t NumVGPR = 0;
int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
+ uint64_t CalleeSegmentSize = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
bool UsesFlatScratch = false;
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
bool HasIndirectCall = false;
-
- int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
- // Total number of VGPRs is actually a combination of AGPR and VGPR
- // depending on architecture - and some alignment constraints
- int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+ SmallVector<const Function *, 16> Callees;
};
- AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
+ AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {}
- bool doInitialization(Module &M) override {
- CallGraphResourceInfo.clear();
- return ModulePass::doInitialization(M);
- }
+ bool runOnMachineFunction(MachineFunction &MF) override;
- bool runOnModule(Module &M) override;
+ const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
- }
-
- const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
- auto Info = CallGraphResourceInfo.find(F);
- assert(Info != CallGraphResourceInfo.end() &&
- "Failed to find resource info for function");
- return Info->getSecond();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
private:
SIFunctionResourceInfo
- analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM,
+ analyzeResourceUsage(const MachineFunction &MF,
uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const;
- void propagateIndirectCallRegisterUsage();
-
- DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+ SIFunctionResourceInfo ResourceInfo;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 671caf8484cd97..d685b635a82a6f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMCInstLower.cpp
AMDGPUIGroupLP.cpp
AMDGPUInsertSingleUseVDST.cpp
+ AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 73d466abc66f7b..a1a41d6cc8c6a0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
<< Alignment.value() << '\n';
}
+void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+ const MCSymbol *HasIndirectCall) {
+#define PRINT_RES_INFO(ARG) \
+ OS << "\t.set "; \
+ ARG->print(OS, getContext().getAsmInfo()); \
+ OS << ", "; \
+ ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
+ Streamer.addBlankLine();
+
+ PRINT_RES_INFO(NumVGPR);
+ PRINT_RES_INFO(NumAGPR);
+ PRINT_RES_INFO(NumExplicitSGPR);
+ PRINT_RES_INFO(PrivateSegmentSize);
+ PRINT_RES_INFO(UsesVCC);
+ PRINT_RES_INFO(UsesFlatScratch);
+ PRINT_RES_INFO(HasDynamicallySizedStack);
+ PRINT_RES_INFO(HasRecursion);
+ PRINT_RES_INFO(HasIndirectCall);
+#undef PRINT_RES_INFO
+}
+
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+ const MCSymbol *MaxAGPR,
+ const MCSymbol *MaxSGPR) {
+#define PRINT_RES_INFO(ARG) \
+ OS << "\t.set "; \
+ ARG->print(OS, getContext().getAsmInfo()); \
+ OS << ", "; \
+ ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
+ Streamer.addBlankLine();
+
+ PRINT_RES_INFO(MaxVGPR);
+ PRINT_RES_INFO(MaxAGPR);
+ PRINT_RES_INFO(MaxSGPR);
+#undef PRINT_RES_INFO
+}
+
bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index bf1538c71d1543..e41f302c3d56ce 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -60,6 +60,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) {
}
+ virtual void EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+      const MCSymbol *HasIndirectCall) {}
+
+ virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+ const MCSymbol *MaxAGPR,
+                                      const MCSymbol *MaxSGPR) {}
+
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
@@ -136,6 +147,18 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
+ void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR,
+ const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion,
+ const MCSymbol *HasIndirectCall) override;
+
+ void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
+ const MCSymbol *MaxSGPR) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index a53bf70d77717b..92d09b3afa77d7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
}
- ExprIt->getSecond() = Val;
} else if (N.getKind() == msgpack::Type::UInt) {
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
- int64_t Unused;
- if (!Val->evaluateAsAbsolute(Unused))
- REM[Reg] = Val;
- (void)Unused;
+ } else {
+      // Default to uint64_t 0 so additional calls to setRegister will allow
+      // ORs to propagate.
+ N = (uint64_t)0;
}
+ REM[Reg] = Val;
DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..a7ab4393f3b0ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3025,8 +3025,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3036,7 +3036,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3061,16 +3061,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 13
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -3116,8 +3116,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3152,16 +3152,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 9
-; MOVREL-NEXT: workitem_vgpr_count = 4
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -3208,8 +3208,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3219,7 +3219,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3244,16 +3244,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 9
-; GFX10-NEXT: workitem_vgpr_count = 3
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3311,7 +3311,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3336,16 +3336,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 7
-; GFX11-NEXT: workitem_vgpr_count = 3
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4042,8 +4042,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4053,7 +4053,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4078,16 +4078,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 12
-; GPRIDX-NEXT: workitem_vgpr_count = 2
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4126,8 +4126,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4137,7 +4137,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4162,16 +4162,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 8
-; MOVREL-NEXT: workitem_vgpr_count = 3
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4211,8 +4211,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4222,7 +4222,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4247,16 +4247,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 8
-; GFX10-NEXT: workitem_vgpr_count = 2
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4296,8 +4296,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4307,7 +4307,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4332,16 +4332,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 5
-; GFX11-NEXT: workitem_vgpr_count = 2
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4389,8 +4389,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4400,7 +4400,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4425,16 +4425,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 13
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4476,8 +4476,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4487,7 +4487,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4512,16 +4512,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 9
-; MOVREL-NEXT: workitem_vgpr_count = 4
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4564,8 +4564,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4575,7 +4575,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4600,16 +4600,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 9
-; GFX10-NEXT: workitem_vgpr_count = 3
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4652,8 +4652,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4663,7 +4663,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4688,16 +4688,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 7
-; GFX11-NEXT: workitem_vgpr_count = 3
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 66b88236bbb4c1..0221d0f790be0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -18,7 +18,7 @@ target triple = "amdgcn-amd-amdhsa"
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 1
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_addrspacecast_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_addrspacecast_in_kernel_no_calls.has_dyn_sized_stack|stack_object_addrspacecast_in_kernel_no_calls.has_recursion))|128)&1
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -38,11 +38,12 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|140)&1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 1
-; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|128)&1
+; RW-FLAT: .amdhsa_reserve_flat_scratch stack_object_in_kernel_no_calls.uses_flat_scratch
+; RW-FLAT: .set stack_object_in_kernel_no_calls.uses_flat_scratch, 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -58,11 +59,12 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|136)&1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 0
-; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|128)&1
+; RW-FLAT: .amdhsa_reserve_flat_scratch kernel_no_calls_no_stack.uses_flat_scratch
+; RW-FLAT: .set kernel_no_calls_no_stack.uses_flat_scratch, 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index d5646820a19832..374ce0676d2205 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -16,7 +16,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index c7afbeabbbb6b1..1c3db1d64b299d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -75,10 +75,10 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
; DEFAULTSIZE: ; ScratchSize: 16
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -137,10 +137,10 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
; DEFAULTSIZE: ; ScratchSize: 64
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 8d87b53efb4e73..e311be4b12218a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -2,9 +2,10 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}kernel_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 44
-; GFX90A: .amdhsa_accum_offset 12
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_32_agprs.num_agpr, kernel_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GFX908: .set kernel_32_agprs.num_vgpr, 9
+; GFX908: .set kernel_32_agprs.num_agpr, 32
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
@@ -24,8 +25,9 @@ bb:
}
; GCN-LABEL: {{^}}kernel_0_agprs:
-; GCN: .amdhsa_next_free_vgpr 1
-; GFX90A: .amdhsa_accum_offset 4
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_0_agprs.num_agpr, kernel_0_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_0_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_0_agprs.num_vgpr, 1
; GCN: NumVgprs: 1
; GCN: NumAgprs: 0
; GCN: TotalNumVgprs: 1
@@ -42,9 +44,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_40_vgprs:
-; GFX908: .amdhsa_next_free_vgpr 40
-; GFX90A: .amdhsa_next_free_vgpr 56
-; GFX90A: .amdhsa_accum_offset 40
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_40_vgprs.num_agpr, kernel_40_vgprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_40_vgprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_40_vgprs.num_vgpr, 40
+; GFX90A: .set kernel_40_vgprs.num_agpr, 16
; GCN: NumVgprs: 40
; GCN: NumAgprs: 16
; GFX908: TotalNumVgprs: 40
@@ -99,9 +102,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_max_gprs:
-; GFX908: .amdhsa_next_free_vgpr 256
-; GFX90A: .amdhsa_next_free_vgpr 512
-; GFX90A: .amdhsa_accum_offset 256
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_max_gprs.num_agpr, kernel_max_gprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_max_gprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_max_gprs.num_vgpr, 256
+; GFX90A: .set kernel_max_gprs.num_agpr, 256
; GCN: NumVgprs: 256
; GCN: NumAgprs: 256
; GFX908: TotalNumVgprs: 256
@@ -121,8 +125,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_accum_offset 12
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_func_32_agprs.num_agpr, kernel_call_func_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_func_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_func_32_agprs.num_vgpr, max(0, func_32_agprs.num_vgpr)
+; GCN: .set kernel_call_func_32_agprs.num_agpr, max(0, func_32_agprs.num_agpr)
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
@@ -154,25 +160,28 @@ bb:
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
-; GCN: NumVgprs: 32
-; GCN: NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_undef_func.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; GCN: NumVgprs: kernel_call_undef_func.num_vgpr
+; GCN: NumAgprs: kernel_call_undef_func.num_agpr
+; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
+; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1
+; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
+; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4
+; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
ret void
}
+; GCN: .set max_num_vgpr, 32
+; GCN-NEXT: .set max_num_agpr, 32
+; GCN-NEXT: .set max_num_sgpr, 34
+
attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 9ec8e425a3f55c..993ff4e4477d35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -35,7 +35,8 @@
; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
; by 4 bytes.
-; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24
+; HSA-ALLOCA: .amdhsa_private_segment_fixed_size mova_same_clause.private_seg_size
+; HSA-ALLOCA: .set mova_same_clause.private_seg_size, 24
; HSA-ALLOCA: s_add_i32 s12, s12, s17
; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
index 99a7ae37e0e78d..f64a5e01cd2560 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -60,7 +60,9 @@ bb:
declare void @undef_func()
; CHECK: .type kernel_call_undef_func
-; CHECK: NumAgprs: 32
+; CHECK: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; CHECK: NumAgprs: kernel_call_undef_func.num_agpr
+; CHECK: .set max_num_agpr, 32
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index e4d427a0b826f8..c893f6b04b7b66 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 {
attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GCN-LABEL: {{^}}f512:
-; GFX9: NumVgprs: 128
-; GFX90A: NumVgprs: 128
-; GFX90A: NumAgprs: 128
-; GFX90A: TotalNumVgprs: 256
-; GFX10WGP-WAVE32: NumVgprs: 256
-; GFX10WGP-WAVE64: NumVgprs: 256
-; GFX10CU-WAVE32: NumVgprs: 128
-; GFX10CU-WAVE64: NumVgprs: 128
-; GFX11WGP-WAVE32: NumVgprs: 256
-; GFX11WGP-WAVE64: NumVgprs: 256
-; GFX11CU-WAVE32: NumVgprs: 192
-; GFX11CU-WAVE64: NumVgprs: 192
+; GFX9: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A: .set f512.num_agpr, max(128, max_num_agpr)
+; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, max_num_vgpr)
+; GCN: NumVgprs: f512.num_vgpr
+; GFX90A: NumAgprs: f512.num_agpr
+; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr)
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
@@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 {
attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GCN-LABEL: {{^}}f1024:
-; GFX9: NumVgprs: 64
-; GFX90A: NumAgprs: 64
-; GFX90A: TotalNumVgprs: 128
-; GFX10WGP-WAVE32: NumVgprs: 128
-; GFX10WGP-WAVE64: NumVgprs: 128
-; GFX10CU-WAVE32: NumVgprs: 64
-; GFX10CU-WAVE64: NumVgprs: 64
-; GFX11WGP-WAVE32: NumVgprs: 192
-; GFX11WGP-WAVE64: NumVgprs: 192
-; GFX11CU-WAVE32: NumVgprs: 96
-; GFX11CU-WAVE64: NumVgprs: 96
+; GFX9: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A: .set f1024.num_agpr, max(64, max_num_agpr)
+; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GCN: NumVgprs: f1024.num_vgpr
+; GFX90A: NumAgprs: f1024.num_agpr
+; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr)
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..77be8605f20015 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -10,12 +10,21 @@
; OPT: .amdhsa_user_sgpr_dispatch_id 0
; OPT: .amdhsa_user_sgpr_flat_scratch_init 0
; OPT: .amdhsa_user_sgpr_private_segment_size 0
-; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|136)&1
; OPT: .amdhsa_system_sgpr_workgroup_id_x 1
; OPT: .amdhsa_system_sgpr_workgroup_id_y 0
; OPT: .amdhsa_system_sgpr_workgroup_id_z 0
; OPT: .amdhsa_system_sgpr_workgroup_info 0
; OPT: .amdhsa_system_vgpr_workitem_id 0
+; OPT: .set foo.num_vgpr, 0
+; OPT: .set foo.num_agpr, 0
+; OPT: .set foo.num_sgpr, 0
+; OPT: .set foo.private_seg_size, 0
+; OPT: .set foo.uses_vcc, 0
+; OPT: .set foo.uses_flat_scratch, 0
+; OPT: .set foo.has_dyn_sized_stack, 0
+; OPT: .set foo.has_recursion, 0
+; OPT: .set foo.has_indirect_call, 0
; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1
; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1
@@ -25,12 +34,25 @@
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
-; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5016)&1
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5012)&1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_y 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1
-; NOOPT: .amdhsa_system_sgpr_workgroup_info 0
-; NOOPT: .amdhsa_system_vgpr_workitem_id 2
+; COV4: .amdhsa_system_sgpr_workgroup_info 0
+; COV5: .amdhsa_system_sgpr_workgroup_info 0
+; COV4: .amdhsa_system_vgpr_workitem_id 2
+; COV5: .amdhsa_system_vgpr_workitem_id 2
+; NOOPT: .set foo.num_vgpr, 0
+; NOOPT: .set foo.num_agpr, 0
+; NOOPT: .set foo.num_sgpr, 0
+; NOOPT: .set foo.private_seg_size, 0
+; NOOPT: .set foo.uses_vcc, 0
+; NOOPT: .set foo.uses_flat_scratch, 0
+; NOOPT: .set foo.has_dyn_sized_stack, 0
+; NOOPT: .set foo.has_recursion, 0
+; NOOPT: .set foo.has_indirect_call, 0
+
define amdgpu_kernel void @foo() {
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index a795e995603410..f5d45993742814 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
@@ -8,12 +8,13 @@
@alias = hidden alias void (), ptr @aliasee_default
; ALL-LABEL: {{^}}kernel:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX908-NEXT: .amdhsa_next_free_sgpr 33
+; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0)
+; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.num_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
+; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GFX90A: .amdhsa_next_free_vgpr 59
-; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
-; GFX90A-NEXT: .amdhsa_accum_offset 32
+; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr)
+; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr)
+; ALL-NEXT: .set kernel.num_sgpr, max(33, aliasee_default.num_sgpr)
define amdgpu_kernel void @kernel() #0 {
bb:
call void @alias() #2
@@ -25,6 +26,9 @@ bb:
call void asm sideeffect "; clobber a26 ", "~{a26}"()
ret void
}
+; ALL: .set aliasee_default.num_vgpr, 0
+; ALL-NEXT: .set aliasee_default.num_agpr, 27
+; ALL-NEXT: .set aliasee_default.num_sgpr, 32
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index c976cc3d53b5eb..092e734ef106be 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -7,14 +7,18 @@
@alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel0:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel0.num_sgpr, max(33, aliasee_default_vgpr64_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel0() #0 {
bb:
call void @alias0() #2
ret void
}
+; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_sgpr, 32
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index edef71ef143dfd..f8287dc518421e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -9,8 +9,12 @@
; The parent kernel has a higher VGPR usage than the possible callees.
; CHECK-LABEL: {{^}}kernel1:
-; CHECK: .amdhsa_next_free_vgpr 41
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.num_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
+
+; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr)
+; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr)
+; CHECK-NEXT: .set kernel1.num_sgpr, max(33, aliasee_vgpr32_sgpr76.num_sgpr)
define amdgpu_kernel void @kernel1() #0 {
bb:
call void asm sideeffect "; clobber v40 ", "~{v40}"()
@@ -18,6 +22,9 @@ bb:
ret void
}
+; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_sgpr, 32
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index bb34ef1a15d2b9..a99b2295dfe85c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -7,14 +7,21 @@
@alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel2:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.num_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
+
+; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel2.num_sgpr, max(33, aliasee_vgpr64_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel2() #0 {
bb:
call void @alias2() #2
ret void
}
+; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_sgpr, 32
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 8a88eb7e51ad72..793dc1bc3a6f33 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -7,14 +7,21 @@
@alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102
; CHECK-LABEL: {{^}}kernel3:
-; CHECK: .amdhsa_next_free_vgpr 253
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.num_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
+
+; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel3.num_sgpr, max(33, aliasee_vgpr256_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel3() #0 {
bb:
call void @alias3() #2
ret void
}
+; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_sgpr, 33
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6af45035d394f8..6311c2a01d366b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -222,10 +222,14 @@ ret:
}
; GCN-LABEL: {{^}}usage_direct_recursion:
-; GCN: .amdhsa_private_segment_fixed_size 18448
+; GCN: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(16384, direct_recursion_use_stack.private_seg_size))
+; GCN: ScratchSize: 18448
;
; GCN-V5-LABEL: {{^}}usage_direct_recursion:
-; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
+; GCN-V5: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN-V5: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN-V5: ScratchSize: 2064
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
@@ -234,10 +238,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr96_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -248,10 +253,11 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr160_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -262,10 +268,11 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_vgpr160_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -273,6 +280,27 @@ entry:
ret void
}
+; GCN: .set max_num_vgpr, 50
+; GCN: .set max_num_agpr, 0
+; GCN: .set max_num_sgpr, 80
+
+; GCN-LABEL: amdhsa.kernels:
+; GCN: .name: count_use_sgpr96_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+; GCN: .name: count_use_sgpr160_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+; GCN: .name: count_use_vgpr160_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+
attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..7e731a70ca4d76 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -469,7 +469,7 @@ define hidden void @use_every_sgpr_input() #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input.has_recursion))|920)&1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
@@ -494,7 +494,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input_no_kernargs.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input_no_kernargs.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input_no_kernargs.has_recursion))|916)&1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 643f2619840a22..ede57f1a0a04ce 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -33,6 +33,7 @@ bb2:
; GCN-LABEL: {{^}}preserve_condition_undef_flag:
; GCN-NOT: vcc
+; GCN: s_endpgm
define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3035a8579c8a6d..13fd714933dbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -15,13 +15,19 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fadd.num_agpr, fadd.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fadd.num_sgpr+(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fadd.uses_vcc
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fadd.uses_flat_scratch
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
+; OSABI-AMDHSA-ASM: .set fadd.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fadd.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fadd.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fadd.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fadd.uses_flat_scratch, 0
+
; ALL-ASM-LABEL: {{^}}fsub:
; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel
@@ -34,13 +40,19 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fsub.num_agpr, fsub.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fsub.num_sgpr+(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fsub.uses_vcc
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fsub.uses_flat_scratch
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
+; OSABI-AMDHSA-ASM: .set fsub.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fsub.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fsub.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fsub.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fsub.uses_flat_scratch, 0
+
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_isa
@@ -93,8 +105,10 @@ entry:
; registers used.
;
; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM: .amdhsa_next_free_vgpr 1
-; ALL-ASM: .amdhsa_next_free_sgpr 1
+; ALL-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(empty.num_agpr, empty.num_vgpr), 1, 0)
+; ALL-ASM: .amdhsa_next_free_sgpr (max(empty.num_sgpr+(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0))
+; ALL-ASM: NumSGPRsForWavesPerEU: 1
+; ALL-ASM: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty(
i32 %i,
ptr addrspace(1) %r,
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
index 9d93609b1e8813..aa1a93cfec3d86 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
@@ -1,8 +1,8 @@
; REQUIRES: asserts
-; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
-; CHECK: function must have been generated already
+; CHECK-NOT: func
define internal i32 @func() {
ret i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 789150f690d52e..69a729f6847f0a 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -62,7 +62,8 @@
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-; VGPR: .amdhsa_private_segment_fixed_size 16
+; VGPR: .amdhsa_private_segment_fixed_size divergent_if_endif.private_seg_size
+; VGPR: .set divergent_if_endif.private_seg_size, 16
define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +134,8 @@ endif:
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-; VGPR: .amdhsa_private_segment_fixed_size 20
+; VGPR: .amdhsa_private_segment_fixed_size divergent_loop.private_seg_size
+; VGPR: .set divergent_loop.private_seg_size, 20
define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll
index f51d9fc5125ba6..423bb95af25df9 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=CARRIZO %s
; Test that we don't try to produce a COFF file on windows
; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
@@ -20,8 +20,9 @@
; CONFIG: .section .AMDGPU.config
; CONFIG-NEXT: .long 45096
-; TYPICAL-NEXT: .long 0
-; TONGA-NEXT: .long 704
+; TYPICAL-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 0)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
+; TONGA-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(96, 1), 8))/8)-1)&15)<<6)
+; CARRIZO-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 1)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
; CONFIG: .p2align 8
; CONFIG: test:
define amdgpu_ps void @test(i32 %p) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
index 78ac2f9eaff020..c4111282682527 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
@@ -7,13 +7,17 @@
; No stack objects, only indirect call has to enable scrathch
; GCN-LABEL: test_indirect_call:
-; COV5: .amdhsa_private_segment_fixed_size 0{{$}}
-; COV4: .amdhsa_private_segment_fixed_size 16384{{$}}
-
+; GCN: .amdhsa_private_segment_fixed_size test_indirect_call.private_seg_size
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
+; COV5: .amdhsa_uses_dynamic_stack ((59|((test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion)<<11))&2048)>>11
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5016)&1
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5020)&1
+
+; COV5: .set test_indirect_call.private_seg_size, 0{{$}}
+; COV4: .set test_indirect_call.private_seg_size, 0+(max(16384))
+; COV5: .set test_indirect_call.has_recursion, 1
+; COV5: .set test_indirect_call.has_indirect_call, 1
-; COV5: .amdhsa_uses_dynamic_stack 1
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
define amdgpu_kernel void @test_indirect_call() {
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
call void %fptr()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
new file mode 100644
index 00000000000000..c411323a70ed31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -0,0 +1,533 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; SGPR use may not seem equal to the sgpr use provided in comments as the latter includes extra sgprs (e.g., for vcc use).
+
+; Functions that don't make calls should have constants as their resource usage as no resource information has to be propagated.
+
+; GCN-LABEL: {{^}}use_vcc:
+; GCN: .set use_vcc.num_vgpr, 0
+; GCN: .set use_vcc.num_agpr, 0
+; GCN: .set use_vcc.num_sgpr, 32
+; GCN: .set use_vcc.private_seg_size, 0
+; GCN: .set use_vcc.uses_vcc, 1
+; GCN: .set use_vcc.uses_flat_scratch, 0
+; GCN: .set use_vcc.has_dyn_sized_stack, 0
+; GCN: .set use_vcc.has_recursion, 0
+; GCN: .set use_vcc.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_vcc() #1 {
+ call void asm sideeffect "", "~{vcc}" () #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_vcc:
+; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr)
+; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr)
+; GCN: .set indirect_use_vcc.num_sgpr, max(34, use_vcc.num_sgpr)
+; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size))
+; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc)
+; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch)
+; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion)
+; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_vcc() #1 {
+ call void @use_vcc()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
+; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_sgpr, max(33, indirect_use_vcc.num_sgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size))
+; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc)
+; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch)
+; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion)
+; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
+ call void @indirect_use_vcc()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scratch:
+; GCN: .set use_flat_scratch.num_vgpr, 0
+; GCN: .set use_flat_scratch.num_agpr, 0
+; GCN: .set use_flat_scratch.num_sgpr, 32
+; GCN: .set use_flat_scratch.private_seg_size, 0
+; GCN: .set use_flat_scratch.uses_vcc, 0
+; GCN: .set use_flat_scratch.uses_flat_scratch, 1
+; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
+; GCN: .set use_flat_scratch.has_recursion, 0
+; GCN: .set use_flat_scratch.has_indirect_call, 0
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_flat_scratch() #1 {
+ call void asm sideeffect "", "~{flat_scratch}" () #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_flat_scratch:
+; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr)
+; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr)
+; GCN: .set indirect_use_flat_scratch.num_sgpr, max(34, use_flat_scratch.num_sgpr)
+; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size))
+; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc)
+; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion)
+; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_flat_scratch() #1 {
+ call void @use_flat_scratch()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_sgpr, max(33, indirect_use_flat_scratch.num_sgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size))
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
+ call void @indirect_use_flat_scratch()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_10_vgpr:
+; GCN: .set use_10_vgpr.num_vgpr, 10
+; GCN: .set use_10_vgpr.num_agpr, 0
+; GCN: .set use_10_vgpr.num_sgpr, 32
+; GCN: .set use_10_vgpr.private_seg_size, 0
+; GCN: .set use_10_vgpr.uses_vcc, 0
+; GCN: .set use_10_vgpr.uses_flat_scratch, 0
+; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
+; GCN: .set use_10_vgpr.has_recursion, 0
+; GCN: .set use_10_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 10
+; GCN: ScratchSize: 0
+define void @use_10_vgpr() #1 {
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
+ call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_10_vgpr:
+; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr)
+; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr)
+; GCN: .set indirect_use_10_vgpr.num_sgpr, max(34, use_10_vgpr.num_sgpr)
+; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size))
+; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc)
+; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch)
+; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion)
+; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_10_vgpr() #0 {
+ call void @use_10_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
+; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr)
+; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr)
+; GCN: .set indirect_2_level_use_10_vgpr.num_sgpr, max(33, indirect_use_10_vgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size))
+; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc)
+; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion)
+; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
+ call void @indirect_use_10_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_50_vgpr:
+; GCN: .set use_50_vgpr.num_vgpr, 50
+; GCN: .set use_50_vgpr.num_agpr, 0
+; GCN: .set use_50_vgpr.num_sgpr, 32
+; GCN: .set use_50_vgpr.private_seg_size, 0
+; GCN: .set use_50_vgpr.uses_vcc, 0
+; GCN: .set use_50_vgpr.uses_flat_scratch, 0
+; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
+; GCN: .set use_50_vgpr.has_recursion, 0
+; GCN: .set use_50_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 0
+define void @use_50_vgpr() #1 {
+ call void asm sideeffect "", "~{v49}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_50_vgpr:
+; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr)
+; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr)
+; GCN: .set indirect_use_50_vgpr.num_sgpr, max(34, use_50_vgpr.num_sgpr)
+; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size))
+; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
+; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
+; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion)
+; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 16
+define void @indirect_use_50_vgpr() #0 {
+ call void @use_50_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_80_sgpr:
+; GCN: .set use_80_sgpr.num_vgpr, 1
+; GCN: .set use_80_sgpr.num_agpr, 0
+; GCN: .set use_80_sgpr.num_sgpr, 80
+; GCN: .set use_80_sgpr.private_seg_size, 8
+; GCN: .set use_80_sgpr.uses_vcc, 0
+; GCN: .set use_80_sgpr.uses_flat_scratch, 0
+; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0
+; GCN: .set use_80_sgpr.has_recursion, 0
+; GCN: .set use_80_sgpr.has_indirect_call, 0
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 8
+define void @use_80_sgpr() #1 {
+ call void asm sideeffect "", "~{s79}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_80_sgpr:
+; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
+; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
+; GCN: .set indirect_use_80_sgpr.num_sgpr, max(34, use_80_sgpr.num_sgpr)
+; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
+; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
+; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
+; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion)
+; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define void @indirect_use_80_sgpr() #1 {
+ call void @use_80_sgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
+; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
+; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
+; GCN: .set indirect_2_level_use_80_sgpr.num_sgpr, max(33, indirect_use_80_sgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
+; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
+; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion)
+; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 86
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
+ call void @indirect_use_80_sgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack0:
+; GCN: .set use_stack0.num_vgpr, 1
+; GCN: .set use_stack0.num_agpr, 0
+; GCN: .set use_stack0.num_sgpr, 33
+; GCN: .set use_stack0.private_seg_size, 2052
+; GCN: .set use_stack0.uses_vcc, 0
+; GCN: .set use_stack0.uses_flat_scratch, 0
+; GCN: .set use_stack0.has_dyn_sized_stack, 0
+; GCN: .set use_stack0.has_recursion, 0
+; GCN: .set use_stack0.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 2052
+define void @use_stack0() #1 {
+ %alloca = alloca [512 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack1:
+; GCN: .set use_stack1.num_vgpr, 1
+; GCN: .set use_stack1.num_agpr, 0
+; GCN: .set use_stack1.num_sgpr, 33
+; GCN: .set use_stack1.private_seg_size, 404
+; GCN: .set use_stack1.uses_vcc, 0
+; GCN: .set use_stack1.uses_flat_scratch, 0
+; GCN: .set use_stack1.has_dyn_sized_stack, 0
+; GCN: .set use_stack1.has_recursion, 0
+; GCN: .set use_stack1.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 404
+define void @use_stack1() #1 {
+ %alloca = alloca [100 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_stack:
+; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
+; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
+; GCN: .set indirect_use_stack.num_sgpr, max(34, use_stack0.num_sgpr)
+; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
+; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
+; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
+; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
+; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion)
+; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define void @indirect_use_stack() #1 {
+ %alloca = alloca [16 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ call void @use_stack0()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_stack:
+; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
+; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
+; GCN: .set indirect_2_level_use_stack.num_sgpr, max(33, indirect_use_stack.num_sgpr)
+; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
+; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
+; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion)
+; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
+ call void @indirect_use_stack()
+ ret void
+}
+
+
+; Should be maximum of callee usage
+; GCN-LABEL: {{^}}multi_call_use_use_stack:
+; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
+; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
+; GCN: .set multi_call_use_use_stack.num_sgpr, max(42, use_stack0.num_sgpr, use_stack1.num_sgpr)
+; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
+; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
+; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
+; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
+; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
+; GCN: NumSgprs: 48
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2052
+define amdgpu_kernel void @multi_call_use_use_stack() #0 {
+ call void @use_stack0()
+ call void @use_stack1()
+ ret void
+}
+
+declare void @external() #0
+
+; GCN-LABEL: {{^}}multi_call_with_external:
+; GCN: .set multi_call_with_external.num_vgpr, max(41, max_num_vgpr)
+; GCN: .set multi_call_with_external.num_agpr, max(0, max_num_agpr)
+; GCN: .set multi_call_with_external.num_sgpr, max(42, max_num_sgpr)
+; GCN: .set multi_call_with_external.private_seg_size, 0
+; GCN: .set multi_call_with_external.uses_vcc, 1
+; GCN: .set multi_call_with_external.uses_flat_scratch, 1
+; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1
+; GCN: .set multi_call_with_external.has_recursion, 0
+; GCN: .set multi_call_with_external.has_indirect_call, 1
+; GCN: NumSgprs: multi_call_with_external.num_sgpr+6
+; GCN: NumVgprs: multi_call_with_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @multi_call_with_external() #0 {
+ call void @use_stack0()
+ call void @use_stack1()
+ call void @external()
+ ret void
+}
+
+; GCN-LABEL: {{^}}usage_external:
+; GCN: .set usage_external.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set usage_external.num_agpr, max(0, max_num_agpr)
+; GCN: .set usage_external.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external.private_seg_size, 0
+; GCN: .set usage_external.uses_vcc, 1
+; GCN: .set usage_external.uses_flat_scratch, 1
+; GCN: .set usage_external.has_dyn_sized_stack, 1
+; GCN: .set usage_external.has_recursion, 0
+; GCN: .set usage_external.has_indirect_call, 1
+; GCN: NumSgprs: usage_external.num_sgpr+6
+; GCN: NumVgprs: usage_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external() #0 {
+ call void @external()
+ ret void
+}
+
+declare void @external_recurse() #2
+
+; GCN-LABEL: {{^}}usage_external_recurse:
+; GCN: .set usage_external_recurse.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set usage_external_recurse.num_agpr, max(0, max_num_agpr)
+; GCN: .set usage_external_recurse.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external_recurse.private_seg_size, 0
+; GCN: .set usage_external_recurse.uses_vcc, 1
+; GCN: .set usage_external_recurse.uses_flat_scratch, 1
+; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1
+; GCN: .set usage_external_recurse.has_recursion, 1
+; GCN: .set usage_external_recurse.has_indirect_call, 1
+; GCN: NumSgprs: usage_external_recurse.num_sgpr+6
+; GCN: NumVgprs: usage_external_recurse.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external_recurse() #0 {
+ call void @external_recurse()
+ ret void
+}
+
+; GCN-LABEL: {{^}}direct_recursion_use_stack:
+; GCN: .set direct_recursion_use_stack.num_vgpr, 41
+; GCN: .set direct_recursion_use_stack.num_agpr, 0
+; GCN: .set direct_recursion_use_stack.num_sgpr, 36
+; GCN: .set direct_recursion_use_stack.private_seg_size, 2064
+; GCN: .set direct_recursion_use_stack.uses_vcc, 1
+; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0
+; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0
+; GCN: .set direct_recursion_use_stack.has_recursion, 1
+; GCN: .set direct_recursion_use_stack.has_indirect_call, 0
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define void @direct_recursion_use_stack(i32 %val) #2 {
+ %alloca = alloca [512 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %ret, label %call
+
+call:
+ %val.sub1 = sub i32 %val, 1
+ call void @direct_recursion_use_stack(i32 %val.sub1)
+ br label %ret
+
+ret:
+ ret void
+}
+
+; GCN-LABEL: {{^}}usage_direct_recursion:
+; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
+; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
+; GCN: .set usage_direct_recursion.num_sgpr, max(33, direct_recursion_use_stack.num_sgpr)
+; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
+; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
+; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
+; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion)
+; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call)
+; GCN: NumSgprs: 42
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
+ call void @direct_recursion_use_stack(i32 %n)
+ ret void
+}
+
+; Make sure there's no assert when a sgpr96 is used.
+; GCN-LABEL: {{^}}count_use_sgpr96_external_call
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0
+; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1
+; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_sgpr96_external_call.has_recursion, 0
+; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr96_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr96_external_call() {
+entry:
+ tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
+ call void @external()
+ ret void
+}
+
+; Make sure there's no assert when a sgpr160 is used.
+; GCN-LABEL: {{^}}count_use_sgpr160_external_call
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0
+; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1
+; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_sgpr160_external_call.has_recursion, 0
+; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr160_external_call() {
+entry:
+ tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+ call void @external()
+ ret void
+}
+
+; Make sure there's no assert when a vgpr160 is used.
+; GCN-LABEL: {{^}}count_use_vgpr160_external_call
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0
+; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1
+; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_vgpr160_external_call.has_recursion, 0
+; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_vgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_vgpr160_external_call() {
+entry:
+ tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+ call void @external()
+ ret void
+}
+
+; Added at the end of the .s are the module level maximums
+; GCN: .set max_num_vgpr, 50
+; GCN: .set max_num_agpr, 0
+; GCN: .set max_num_sgpr, 80
+
+attributes #0 = { nounwind noinline norecurse }
+attributes #1 = { nounwind noinline norecurse }
+attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 0f951e89d37c8a..4f300e2282426e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -36,8 +36,8 @@
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -65,8 +65,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -98,8 +98,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -145,8 +145,8 @@ define amdgpu_kernel void @queue_ptr() #1 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
index f20d720c3876ba..dce4162c246247 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
@@ -41,7 +41,7 @@ entry:
}
; FIXME: This should warn too
-; ERR-NOT: warning
+; ERR-NOT: warning: inline asm clobber list contains reserved registers
define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) {
entry:
%exec = call i64 asm sideeffect "; def $0", "={exec}"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index cddfb21a6fbdf4..6e07e2fc9da79a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -3,6 +3,18 @@
declare i32 @llvm.amdgcn.workitem.id.x()
+define <2 x i64> @f1() #0 {
+; GFX11-LABEL: f1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ ret <2 x i64> zeroinitializer
+}
+
define void @f0() {
; GFX11-LABEL: f0:
; GFX11: ; %bb.0: ; %bb
@@ -36,18 +48,6 @@ bb:
ret void
}
-define <2 x i64> @f1() #0 {
-; GFX11-LABEL: f1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- ret <2 x i64> zeroinitializer
-}
-
; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index b49931379b84a5..4575df1e0c6b95 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -105,13 +105,6 @@ define void @test_funcx2() #0 {
ret void
}
-; GCN-LABEL: {{^}}wombat:
-define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
-bb:
- call void @hoge() #0
- ret void
-}
-
; Make sure we save/restore the return address around the call.
; Function Attrs: norecurse
define internal void @hoge() #2 {
@@ -128,6 +121,13 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}wombat:
+define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
+bb:
+ call void @hoge() #0
+ ret void
+}
+
declare dso_local void @eggs()
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da251..d9a5a49e75f0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -5,12 +5,14 @@ declare void @llvm.trap() #0
; DOORBELL: .amdhsa_kernel trap
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
-; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
+; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size trap.private_seg_size
; DOORBELL-NEXT: .amdhsa_kernarg_size 8
; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12
; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; DOORBELL: .end_amdhsa_kernel
+; DOORBELL: .set trap.private_seg_size, 0
+
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 {
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.trap()
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
index cdd6e88dd103b7..f3d9e9a727c251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -1,6 +1,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
; CHECK-LABEL: non_kernel_recursion:
+; CHECK: .set non_kernel_recursion.has_recursion, 1
+; CHECK: .set non_kernel_recursion.has_indirect_call, 0
define void @non_kernel_recursion(i32 %val) #2 {
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %ret, label %call
@@ -16,8 +18,11 @@ ret:
; CHECK-LABEL: kernel_caller_recursion:
; CHECK: .amd_kernel_code_t
-; CHECK: is_dynamic_callstack = 1
+; CHECK: is_dynamic_callstack = kernel_caller_recursion.has_dyn_sized_stack|kernel_caller_recursion.has_recursion
; CHECK: .end_amd_kernel_code_t
+
+; CHECK: .set kernel_caller_recursion.has_recursion, or(1, non_kernel_recursion.has_recursion)
+; CHECK: .set kernel_caller_recursion.has_indirect_call, or(0, non_kernel_recursion.has_indirect_call)
define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
call void @non_kernel_recursion(i32 %n)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 7698372b687797..64bc14d750573b 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,10 +1,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=GCN,CI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,VI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s
-; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=BON,GCNHSA,ALL %s
+; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=CAR,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GFX11,GCNHSA,ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
@@ -24,7 +24,7 @@
; GCNHSA: .amdhsa_kernel large_alloca_compute_shader
; GCNHSA: .amdhsa_group_segment_fixed_size 0
-; GCNHSA: .amdhsa_private_segment_fixed_size 32772
+; GCNHSA: .amdhsa_private_segment_fixed_size large_alloca_compute_shader.private_seg_size
; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1
; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1
; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1
@@ -32,14 +32,19 @@
; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1
; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1
; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0
-; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(large_alloca_compute_shader.private_seg_size*{{32|64}}, {{1024|256}}))/{{1024|256}})>0)||(large_alloca_compute_shader.has_dyn_sized_stack|large_alloca_compute_shader.has_recursion))|5020)&1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0
; GCNHSA: .amdhsa_system_vgpr_workitem_id 2
-; GCNHSA: .amdhsa_next_free_vgpr 3
-; GCNHSA: .amdhsa_next_free_sgpr 18
+; GCNHSA: .amdhsa_next_free_vgpr max(totalnumvgprs(large_alloca_compute_shader.num_agpr, large_alloca_compute_shader.num_vgpr), 1, 0)
+; BON: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; CAR: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX10: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX11: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; GCNHSA: .amdhsa_reserve_vcc large_alloca_compute_shader.uses_vcc
+; GCNHSA: .amdhsa_reserve_flat_scratch large_alloca_compute_shader.uses_flat_scratch
; GCNHSA: .amdhsa_float_round_mode_32 0
; GCNHSA: .amdhsa_float_round_mode_16_64 0
; GCNHSA: .amdhsa_float_denorm_mode_32 3
@@ -55,6 +60,16 @@
; GCNHSA: .amdhsa_exception_int_div_zero 0
; GCNHSA: .end_amdhsa_kernel
+; GCNHSA: .set large_alloca_compute_shader.num_vgpr, 3
+; GCNHSA: .set large_alloca_compute_shader.num_agpr, 0
+; GCNHSA: .set large_alloca_compute_shader.num_sgpr, 18
+; GCNHSA: .set large_alloca_compute_shader.private_seg_size, 32772
+; GCNHSA: .set large_alloca_compute_shader.uses_vcc
+; GCNHSA: .set large_alloca_compute_shader.uses_flat_scratch, 0
+; GCNHSA: .set large_alloca_compute_shader.has_dyn_sized_stack, 0
+; GCNHSA: .set large_alloca_compute_shader.has_recursion, 0
+; GCNHSA: .set large_alloca_compute_shader.has_indirect_call, 0
+
; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 1b1ea52520c0bf..cf331e119126ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,12 +146,9 @@
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O0-NEXT: Machine Optimization Remark Emitter
; GCN-O0-NEXT: Stack Frame Layout Analysis
-; GCN-O0-NEXT: Function register usage analysis
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O0-NEXT: Machine Optimization Remark Emitter
-; GCN-O0-NEXT: AMDGPU Assembly Printer
-; GCN-O0-NEXT: Free MachineFunction
+; GCN-O0-NEXT: Function register usage analysis
+; GCN-O0-NEXT: AMDGPU Assembly Printer
+; GCN-O0-NEXT: Free MachineFunction
; GCN-O1:Target Library Information
; GCN-O1-NEXT:Target Pass Configuration
@@ -421,12 +418,9 @@
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Stack Frame Layout Analysis
-; GCN-O1-NEXT: Function register usage analysis
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O1-NEXT: Machine Optimization Remark Emitter
-; GCN-O1-NEXT: AMDGPU Assembly Printer
-; GCN-O1-NEXT: Free MachineFunction
+; GCN-O1-NEXT: Function register usage analysis
+; GCN-O1-NEXT: AMDGPU Assembly Printer
+; GCN-O1-NEXT: Free MachineFunction
; GCN-O1-OPTS:Target Library Information
; GCN-O1-OPTS-NEXT:Target Pass Configuration
@@ -724,12 +718,9 @@
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis
-; GCN-O1-OPTS-NEXT: Function register usage analysis
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
-; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
-; GCN-O1-OPTS-NEXT: Free MachineFunction
+; GCN-O1-OPTS-NEXT: Function register usage analysis
+; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
+; GCN-O1-OPTS-NEXT: Free MachineFunction
; GCN-O2:Target Library Information
; GCN-O2-NEXT:Target Pass Configuration
@@ -1033,12 +1024,9 @@
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Stack Frame Layout Analysis
-; GCN-O2-NEXT: Function register usage analysis
-; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O2-NEXT: Machine Optimization Remark Emitter
-; GCN-O2-NEXT: AMDGPU Assembly Printer
-; GCN-O2-NEXT: Free MachineFunction
+; GCN-O2-NEXT: Function register usage analysis
+; GCN-O2-NEXT: AMDGPU Assembly Printer
+; GCN-O2-NEXT: Free MachineFunction
; GCN-O3:Target Library Information
; GCN-O3-NEXT:Target Pass Configuration
@@ -1354,12 +1342,9 @@
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Stack Frame Layout Analysis
-; GCN-O3-NEXT: Function register usage analysis
-; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O3-NEXT: Machine Optimization Remark Emitter
-; GCN-O3-NEXT: AMDGPU Assembly Printer
-; GCN-O3-NEXT: Free MachineFunction
+; GCN-O3-NEXT: Function register usage analysis
+; GCN-O3-NEXT: AMDGPU Assembly Printer
+; GCN-O3-NEXT: Free MachineFunction
define void @empty() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index eaee8ec73fe411..778060d3c5fb3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -29,7 +29,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -47,7 +47,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index 34dcdaf29677e4..b508ffff8050a8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -9,6 +9,19 @@
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
+; GCN-LABEL: {{^}}f0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
+; GCN: ds_write_b8 [[NULL]], [[TREE]]
+define void @f0() {
+; OPT-LABEL: @f0() {
+; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
+; OPT-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
+ ret void
+}
+
; GCN-LABEL: {{^}}k0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
@@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() {
call void @f0()
ret void
}
-
-; GCN-LABEL: {{^}}f0:
-; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
-; GCN: ds_write_b8 [[NULL]], [[TREE]]
-define void @f0() {
-; OPT-LABEL: @f0() {
-; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
-; OPT-NEXT: ret void
-;
- store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index 7f0f473c11bd59..f3cf2a5ca8ff62 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,14 +1,15 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,ALL %s
; SPI_TMPRING_SIZE.WAVESIZE = 5
; GFX10: .long 165608
-; GFX10-NEXT: .long 20480
+; GFX10-NEXT: .long (((alignto(scratch_ps.private_seg_size*32, 1024))/1024)&8191)<<12
; SPI_TMPRING_SIZE.WAVESIZE = 17
; GFX11: .long 165608
-; GFX11-NEXT: .long 69632
+; 11XFG-TXEN: .long 69632
+; GFX11-NEXT: .long (((alignto(scratch_ps.private_seg_size*32, 256))/256)&32767)<<12
; GCN-LABEL: {{^}}scratch_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
@@ -23,3 +24,5 @@ entry:
store volatile i32 2, ptr addrspace(5) %ptr
ret void
}
+
+; ALL: .set scratch_ps.private_seg_size, 132
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 27b71dd471a839..aa16937d7d897d 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel
}
; CHECK: ; LDSByteSize: 4 bytes
+define void @nonkernel() {
+; GFX9-LABEL: nonkernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT: ds_write_b64 v0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: nonkernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; G_GFX9-LABEL: nonkernel:
+; G_GFX9: ; %bb.0:
+; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
+; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
+; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX9-NEXT: ds_write_b32 v3, v2
+; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; G_GFX10-LABEL: nonkernel:
+; G_GFX10: ; %bb.0:
+; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
+; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
+; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT: ds_write_b32 v3, v2
+; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: s_setpc_b64 s[30:31]
+ store i32 0, ptr addrspace(3) @used_by_both
+ store double 0.0, ptr addrspace(3) @used_by_function
+ ret void
+}
+
; Needs to allocate both variables, store to used_by_both is at sizeof(double)
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
@@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() {
}
; CHECK: ; LDSByteSize: 4 bytes
-
-define void @nonkernel() {
-; GFX9-LABEL: nonkernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
-; GFX9-NEXT: ds_write_b64 v0, v[0:1]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: nonkernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
-; GFX10-NEXT: ds_write_b64 v0, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; G_GFX9-LABEL: nonkernel:
-; G_GFX9: ; %bb.0:
-; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
-; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
-; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
-; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
-; G_GFX9-NEXT: ds_write_b32 v3, v2
-; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
-; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: nonkernel:
-; G_GFX10: ; %bb.0:
-; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
-; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
-; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT: ds_write_b32 v3, v2
-; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_setpc_b64 s[30:31]
- store i32 0, ptr addrspace(3) @used_by_both
- store double 0.0, ptr addrspace(3) @used_by_function
- ret void
-}
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index b84686139d0e2c..3c100cf7a38527 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -110,13 +110,14 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 4112
; DEFAULTSIZE: ; ScratchSize: 4112
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((41|((kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 16
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack, 1
; DEFAULTSIZE-V5: ; ScratchSize: 16
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 1040
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -205,13 +206,16 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 4160
; DEFAULTSIZE: ; ScratchSize: 4160
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((59|((kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 64
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack, 1
; DEFAULTSIZE-V5: ; ScratchSize: 64
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 1088
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index d58477c194ea62..c0d228e1254e64 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -3,7 +3,11 @@
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
+; CHECK: .set recursive.private_seg_size, 16+(max(16384))
; CHECK: ScratchSize: 16
+
+; V5-LABEL: {{^}}recursive:
+; V5: .set recursive.has_recursion, 1
define void @recursive() {
call void @recursive()
store volatile i32 0, ptr addrspace(1) undef
@@ -11,18 +15,22 @@ define void @recursive() {
}
; CHECK-LABEL: {{^}}tail_recursive:
+; CHECK: .set tail_recursive.private_seg_size, 0
; CHECK: ScratchSize: 0
define void @tail_recursive() {
tail call void @tail_recursive()
ret void
}
+; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
define void @calls_tail_recursive() norecurse {
tail call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}tail_recursive_with_stack:
+; CHECK: .set tail_recursive_with_stack.private_seg_size, 8
+; CHECK: .set tail_recursive_with_stack.has_recursion, 1
define void @tail_recursive_with_stack() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
@@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() {
; For an arbitrary recursive call, report a large number for unknown stack
; usage for code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
;
; V5-LABEL: {{^}}calls_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
+; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
define amdgpu_kernel void @calls_recursive() {
call void @recursive()
ret void
@@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
; Make sure we do not report a huge stack size for tail recursive
; functions
; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
+; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
call void @calls_tail_recursive()
ret void
@@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; in the kernel.
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
define amdgpu_kernel void @kernel_calls_tail_recursive() {
call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .amdhsa_private_segment_fixed_size 8{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
+; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
call void @tail_recursive_with_stack()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 002de8bb4eb510..87ec51fb44ac45 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,12 +2,12 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t
; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
-; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
-; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
-; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: test_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: test_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: test_kernel.private_seg_size
; STDERR-NEXT: remark: foo.cl:27:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: 5
+; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: LDS Size [bytes/block]: 512
@@ -19,7 +19,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: 'Function Name: '
-; REMARK-NEXT: - FunctionName: test_kernel
+; REMARK-NEXT: - FunctionName: test_kernel
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -28,7 +28,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' SGPRs: '
-; REMARK-NEXT: - NumSGPR: '28'
+; REMARK-NEXT: - NumSGPR: 'test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -37,7 +37,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' VGPRs: '
-; REMARK-NEXT: - NumVGPR: '9'
+; REMARK-NEXT: - NumVGPR: test_kernel.num_vgpr
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -46,7 +46,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' AGPRs: '
-; REMARK-NEXT: - NumAGPR: '43'
+; REMARK-NEXT: - NumAGPR: test_kernel.num_agpr
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -55,17 +55,17 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' ScratchSize [bytes/lane]: '
-; REMARK-NEXT: - ScratchSize: '0'
-; REMARK-NEXT: ..
+; REMARK-NEXT: - ScratchSize: test_kernel.private_seg_size
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: DynamicStack
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' Dynamic Stack:
-; REMARK-NEXT: - DynamicStack: 'False'
-; REMARK-NEXT: ..
+; REMARK-NEXT: - String: ' Dynamic Stack: '
+; REMARK-NEXT: - DynamicStack: 'False'
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: Occupancy
@@ -73,7 +73,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' Occupancy [waves/SIMD]: '
-; REMARK-NEXT: - Occupancy: '5'
+; REMARK-NEXT: - Occupancy: 'occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -122,12 +122,12 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
-; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: empty_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: empty_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: empty_kernel.private_seg_size
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(empty_kernel.num_agpr, empty_kernel.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0
@@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
}
; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
+; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
+; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: test_indirect_call.private_seg_size
+; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
}
; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
-; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
+; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
+; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: test_indirect_w_static_stack.private_seg_size
+; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index bba59ba4d80302..5d5aad76afd095 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -1,6 +1,6 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
; Make sure there's no assertion when trying to report the resource
; usage for a function which becomes dead during codegen.
@@ -21,9 +21,10 @@ define internal fastcc void @unreachable() {
; GCN-NOT: s_swappc_b64
; GCN: s_endpgm
-; GCN: .amdhsa_private_segment_fixed_size 0
-; GCN-NOT: .amdhsa_uses_dynamic_stack 0
-; GCN-V5: .amdhsa_uses_dynamic_stack 0
+; GCN-NOT: .amdhsa_uses_dynamic_stack
+; GCN-V5: .amdhsa_uses_dynamic_stack
+; ALL: .set entry.private_seg_size, 0
+; ALL: .set entry.has_dyn_sized_stack, 0
define amdgpu_kernel void @entry() {
bb0:
br i1 false, label %bb1, label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 6ddf0986755f95..38d202eb4308f6 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel max_alignment_128
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 256
+; VI-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -29,16 +29,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -54,6 +54,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set max_alignment_128.num_vgpr, 1
+; VI-NEXT: .set max_alignment_128.num_agpr, 0
+; VI-NEXT: .set max_alignment_128.num_sgpr, 18
+; VI-NEXT: .set max_alignment_128.private_seg_size, 256
+; VI-NEXT: .set max_alignment_128.uses_vcc, 0
+; VI-NEXT: .set max_alignment_128.uses_flat_scratch, 0
+; VI-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
+; VI-NEXT: .set max_alignment_128.has_recursion, 0
+; VI-NEXT: .set max_alignment_128.has_indirect_call, 0
;
; GFX9-LABEL: max_alignment_128:
; GFX9: ; %bb.0:
@@ -70,7 +79,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel max_alignment_128
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -80,16 +89,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -107,6 +116,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set max_alignment_128.num_vgpr, 1
+; GFX9-NEXT: .set max_alignment_128.num_agpr, 0
+; GFX9-NEXT: .set max_alignment_128.num_sgpr, 18
+; GFX9-NEXT: .set max_alignment_128.private_seg_size, 256
+; GFX9-NEXT: .set max_alignment_128.uses_vcc, 0
+; GFX9-NEXT: .set max_alignment_128.uses_flat_scratch, 0
+; GFX9-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set max_alignment_128.has_recursion, 0
+; GFX9-NEXT: .set max_alignment_128.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 128, addrspace(5)
@@ -130,7 +148,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel stackrealign_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 12
+; VI-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -140,16 +158,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -165,6 +183,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set stackrealign_attr.num_vgpr, 1
+; VI-NEXT: .set stackrealign_attr.num_agpr, 0
+; VI-NEXT: .set stackrealign_attr.num_sgpr, 18
+; VI-NEXT: .set stackrealign_attr.private_seg_size, 12
+; VI-NEXT: .set stackrealign_attr.uses_vcc, 0
+; VI-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
+; VI-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
+; VI-NEXT: .set stackrealign_attr.has_recursion, 0
+; VI-NEXT: .set stackrealign_attr.has_indirect_call, 0
;
; GFX9-LABEL: stackrealign_attr:
; GFX9: ; %bb.0:
@@ -181,7 +208,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -191,16 +218,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -218,6 +245,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set stackrealign_attr.num_vgpr, 1
+; GFX9-NEXT: .set stackrealign_attr.num_agpr, 0
+; GFX9-NEXT: .set stackrealign_attr.num_sgpr, 18
+; GFX9-NEXT: .set stackrealign_attr.private_seg_size, 12
+; GFX9-NEXT: .set stackrealign_attr.uses_vcc, 0
+; GFX9-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
+; GFX9-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set stackrealign_attr.has_recursion, 0
+; GFX9-NEXT: .set stackrealign_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
@@ -241,7 +277,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel alignstack_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 128
+; VI-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -251,16 +287,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -276,6 +312,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set alignstack_attr.num_vgpr, 1
+; VI-NEXT: .set alignstack_attr.num_agpr, 0
+; VI-NEXT: .set alignstack_attr.num_sgpr, 18
+; VI-NEXT: .set alignstack_attr.private_seg_size, 128
+; VI-NEXT: .set alignstack_attr.uses_vcc, 0
+; VI-NEXT: .set alignstack_attr.uses_flat_scratch, 0
+; VI-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
+; VI-NEXT: .set alignstack_attr.has_recursion, 0
+; VI-NEXT: .set alignstack_attr.has_indirect_call, 0
;
; GFX9-LABEL: alignstack_attr:
; GFX9: ; %bb.0:
@@ -292,7 +337,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel alignstack_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -302,16 +347,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -329,6 +374,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set alignstack_attr.num_vgpr, 1
+; GFX9-NEXT: .set alignstack_attr.num_agpr, 0
+; GFX9-NEXT: .set alignstack_attr.num_sgpr, 18
+; GFX9-NEXT: .set alignstack_attr.private_seg_size, 128
+; GFX9-NEXT: .set alignstack_attr.uses_vcc, 0
+; GFX9-NEXT: .set alignstack_attr.uses_flat_scratch, 0
+; GFX9-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set alignstack_attr.has_recursion, 0
+; GFX9-NEXT: .set alignstack_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0d..432d8e0e856dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c9959..b6b30bc591e2b9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0))
; ASM: .amdhsa_reserve_xnack_mask 0
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a9..0aa5f2a0919761 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 9bab3e6fcf8c45..c2845bf1035640 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -5,23 +5,32 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t1 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t2 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t1
+; RUN: FileCheck -check-prefix=GCN %s < %t2
; enable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t3 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t4 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t3
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t4
; disable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t5 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t6 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t5
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t6
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs %s -o %t7 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs %s -o %t8 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t7
+; RUN: FileCheck -check-prefix=GCN %s < %t8
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
@@ -31,11 +40,11 @@ declare void @llvm.debugtrap() #1
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 5080
+; MESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 5016
+; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5016
; GCN-LABEL: {{^}}hsa_trap:
; HSA-TRAP: s_mov_b64 s[0:1], s[6:7]
@@ -59,11 +68,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) {
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 5080
+; MESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 5016
+; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5016
; GCN-LABEL: {{^}}hsa_debugtrap:
; HSA-TRAP: s_trap 3
>From 5dff9e2352a3c00164c6fe6e9d9fdc27a0c5fb15 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 12 Aug 2024 15:56:46 +0100
Subject: [PATCH 02/11] Formatting
---
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e41f302c3d56ce..6a91ad06de5d12 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -65,11 +65,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall){};
+ const MCSymbol *HasIndirectCall) {};
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR){};
+ const MCSymbol *MaxSGPR) {};
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
>From 82ca3282e05bffaf1b56c0d17727d246f8bb496c Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Fri, 16 Aug 2024 14:23:43 +0100
Subject: [PATCH 03/11] Feedback
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 49 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 7 ++-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 10 ++--
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 11 ++---
4 files changed, 40 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 23bc804515e690..524be4cbc4a2ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -94,8 +94,6 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
RI = std::make_unique<MCResourceInfo>(OutContext);
- OccupancyValidateMap =
- std::make_unique<DenseMap<const Function *, const MCExpr *>>();
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -363,7 +361,7 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
-void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
+void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
@@ -438,8 +436,8 @@ void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
}
}
- auto I = OccupancyValidateMap->find(&F);
- if (I != OccupancyValidateMap->end()) {
+ auto I = OccupancyValidateMap.find(&F);
+ if (I != OccupancyValidateMap.end()) {
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-waves-per-eu", {0, 0}, true);
uint64_t Occupancy;
@@ -473,13 +471,13 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Assign expressions which can only be resolved when all other functions are
// known.
- RI->Finalize();
+ RI->finalize();
getTargetStreamer()->EmitMCResourceMaximums(
RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
- for (Function &F : M.functions()) {
- ValidateMCResourceInfo(F);
- }
+ for (Function &F : M.functions())
+ validateMCResourceInfo(F);
+
return AsmPrinter::doFinalization(M);
}
@@ -867,6 +865,24 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
return CodeSize;
}
+// AccumOffset computed for the MCExpr equivalent of:
+// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
+static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
+ const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+ const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+ // Can't be lower than 1 for subsequent alignTo.
+ const MCExpr *MaximumTaken =
+ AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
+
+ // Practically, it's computing divideCeil(MaximumTaken, 4).
+ const MCExpr *DivCeil = MCBinaryExpr::createDiv(
+ AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
+ Ctx);
+
+ return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
+}
+
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -891,24 +907,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
return MCSymbolRefExpr::create(Sym, Ctx);
};
- const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
- const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
-
using RIK = MCResourceInfo::ResourceInfoKind;
ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- // AccumOffset computed for the MCExpr equivalent of:
- // alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
- ProgInfo.AccumOffset = MCBinaryExpr::createSub(
- MCBinaryExpr::createDiv(
- AMDGPUMCExpr::createAlignTo(
- AMDGPUMCExpr::createMax({ConstOne, ProgInfo.NumArchVGPR}, Ctx),
- ConstFour, Ctx),
- ConstFour, Ctx),
- ConstOne, Ctx);
+ ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
@@ -1216,7 +1221,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
- OccupancyValidateMap->insert({&MF.getFunction(), ProgInfo.Occupancy});
+ OccupancyValidateMap.insert({&MF.getFunction(), ProgInfo.Occupancy});
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 676a4687ee2af7..a49ef406268d76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -49,11 +49,10 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- // ValidateMCResourceInfo cannot recompute parts of the occupancy as it does
+ // validateMCResourceInfo cannot recompute parts of the occupancy as it does
// for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
// really want to track and validate the occupancy.
- std::unique_ptr<DenseMap<const Function *, const MCExpr *>>
- OccupancyValidateMap;
+ DenseMap<const Function *, const MCExpr *> OccupancyValidateMap;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
@@ -91,7 +90,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
/// Attempts to replace the validation that is missed in getSIProgramInfo due
/// to MCExpr being unknown. Invoked during doFinalization such that the
/// MCResourceInfo symbols are known.
- void ValidateMCResourceInfo(Function &F);
+ void validateMCResourceInfo(Function &F);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 58383475b312c9..a704a5fdc1cf19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -68,7 +68,7 @@ void MCResourceInfo::assignMaxRegs() {
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
}
-void MCResourceInfo::Finalize() {
+void MCResourceInfo::finalize() {
assert(!finalized && "Cannot finalize ResourceInfo again.");
finalized = true;
assignMaxRegs();
@@ -93,8 +93,8 @@ void MCResourceInfo::assignResourceInfoExpr(
const MCConstantExpr *localConstExpr =
MCConstantExpr::create(localValue, OutContext);
const MCExpr *SymVal = localConstExpr;
- if (Callees.size() > 0) {
- std::vector<const MCExpr *> ArgExprs;
+ if (!Callees.empty()) {
+ SmallVector<const MCExpr *, 8> ArgExprs;
// Avoid recursive symbol assignment.
SmallSet<StringRef, 8> Seen;
ArgExprs.push_back(localConstExpr);
@@ -148,7 +148,7 @@ void MCResourceInfo::gatherResourceInfo(
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
// + max(FRI.Callees, FRI.CalleeSegmentSize)
- std::vector<const MCExpr *> ArgExprs;
+ SmallVector<const MCExpr *, 8> ArgExprs;
if (FRI.CalleeSegmentSize)
ArgExprs.push_back(
MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
@@ -162,7 +162,7 @@ void MCResourceInfo::gatherResourceInfo(
}
const MCExpr *localConstExpr =
MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
- if (ArgExprs.size() > 0) {
+ if (!ArgExprs.empty()) {
const AMDGPUMCExpr *transitiveExpr =
AMDGPUMCExpr::createMax(ArgExprs, OutContext);
localConstExpr =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 6646003693a67f..97c1843e3b80f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -37,9 +37,9 @@ class MCResourceInfo {
};
private:
- int32_t MaxVGPR;
- int32_t MaxAGPR;
- int32_t MaxSGPR;
+ int32_t MaxVGPR = 0;
+ int32_t MaxAGPR = 0;
+ int32_t MaxSGPR = 0;
MCContext &OutContext;
bool finalized;
@@ -54,8 +54,7 @@ class MCResourceInfo {
public:
MCResourceInfo(MCContext &OutContext)
- : MaxVGPR(0), MaxAGPR(0), MaxSGPR(0), OutContext(OutContext),
- finalized(false) {}
+ : OutContext(OutContext), finalized(false) {}
void addMaxVGPRCandidate(int32_t candidate) {
MaxVGPR = std::max(MaxVGPR, candidate);
}
@@ -72,7 +71,7 @@ class MCResourceInfo {
// Resolves the final symbols that requires the inter-function resource info
// to be resolved.
- void Finalize();
+ void finalize();
MCSymbol *getMaxVGPRSymbol();
MCSymbol *getMaxAGPRSymbol();
>From 9c18593f64a2ca4f8f66a6193a631ab9308d78e0 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 19 Aug 2024 06:42:51 -0700
Subject: [PATCH 04/11] Feedback, capitalizations, move ResourceUsage
initialization prior to amdhsa kernel descriptor emit, remove duplicate
validation for stack size
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 17 +-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 77 ++++----
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 4 +-
.../AMDGPU/GlobalISel/extractelement.ll | 168 +++++++++---------
.../AMDGPU/GlobalISel/flat-scratch-init.ll | 16 +-
.../GlobalISel/llvm.amdgcn.workitem.id.ll | 6 +-
.../AMDGPU/GlobalISel/non-entry-alloca.ll | 8 +-
.../CodeGen/AMDGPU/agpr-register-count.ll | 32 ++--
.../CodeGen/AMDGPU/amdgpu.private-memory.ll | 3 +-
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 30 +---
.../AMDGPU/call-graph-register-usage.ll | 8 +-
.../callee-special-input-sgprs-fixed-abi.ll | 4 +-
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 34 ++--
.../AMDGPU/control-flow-fastregalloc.ll | 6 +-
llvm/test/CodeGen/AMDGPU/elf.ll | 7 +-
.../enable-scratch-only-dynamic-stack.ll | 14 +-
llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 4 +-
.../CodeGen/AMDGPU/kernel_code_t_recurse.ll | 7 +-
.../CodeGen/AMDGPU/large-alloca-compute.ll | 31 +---
.../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 6 +-
llvm/test/CodeGen/AMDGPU/mesa3d.ll | 13 +-
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 20 +--
.../AMDGPU/resource-optimization-remarks.ll | 52 +++---
.../CodeGen/AMDGPU/stack-realign-kernel.ll | 126 ++++---------
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 3 +-
llvm/test/CodeGen/AMDGPU/trap.ll | 33 ++--
28 files changed, 289 insertions(+), 446 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 524be4cbc4a2ab..3e830881e8395b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -630,6 +630,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->switchSection(ConfigSection);
}
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo();
+ RI->gatherResourceInfo(MF, Info);
+
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
}
@@ -662,9 +666,6 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
STM.hasMAIInsts());
{
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
- ResourceUsage->getResourceInfo();
- RI->gatherResourceInfo(MF, Info);
using RIK = MCResourceInfo::ResourceInfoKind;
getTargetStreamer()->EmitMCResourceInfo(
RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR),
@@ -923,16 +924,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
- const uint64_t MaxScratchPerWorkitem =
- STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
- uint64_t ScratchSize;
- if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
- ScratchSize > MaxScratchPerWorkitem) {
- DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
- MaxScratchPerWorkitem, DS_Error);
- MF.getFunction().getContext().diagnose(DiagStackSize);
- }
-
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// The calculations related to SGPR/VGPR blocks are
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index a704a5fdc1cf19..0ae0907f99ec5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -22,26 +22,28 @@
using namespace llvm;
MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
+ auto GOCS = [this, FuncName](StringRef Suffix) {
+ return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix));
+ };
switch (RIK) {
case RIK_NumVGPR:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".num_vgpr"));
+ return GOCS(".num_vgpr");
case RIK_NumAGPR:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".num_agpr"));
+ return GOCS(".num_agpr");
case RIK_NumSGPR:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".num_sgpr"));
+ return GOCS(".num_sgpr");
case RIK_PrivateSegSize:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".private_seg_size"));
+ return GOCS(".private_seg_size");
case RIK_UsesVCC:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_vcc"));
+ return GOCS(".uses_vcc");
case RIK_UsesFlatScratch:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_flat_scratch"));
+ return GOCS(".uses_flat_scratch");
case RIK_HasDynSizedStack:
- return OutContext.getOrCreateSymbol(FuncName +
- Twine(".has_dyn_sized_stack"));
+ return GOCS(".has_dyn_sized_stack");
case RIK_HasRecursion:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".has_recursion"));
+ return GOCS(".has_recursion");
case RIK_HasIndirectCall:
- return OutContext.getOrCreateSymbol(FuncName + Twine(".has_indirect_call"));
+ return GOCS(".has_indirect_call");
}
llvm_unreachable("Unexpected ResourceInfoKind.");
}
@@ -69,8 +71,8 @@ void MCResourceInfo::assignMaxRegs() {
}
void MCResourceInfo::finalize() {
- assert(!finalized && "Cannot finalize ResourceInfo again.");
- finalized = true;
+ assert(!Finalized && "Cannot finalize ResourceInfo again.");
+ Finalized = true;
assignMaxRegs();
}
@@ -87,25 +89,26 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol() {
}
void MCResourceInfo::assignResourceInfoExpr(
- int64_t localValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
+ int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
const MachineFunction &MF,
const SmallVectorImpl<const Function *> &Callees) {
- const MCConstantExpr *localConstExpr =
- MCConstantExpr::create(localValue, OutContext);
- const MCExpr *SymVal = localConstExpr;
+ const MCConstantExpr *LocalConstExpr =
+ MCConstantExpr::create(LocalValue, OutContext);
+ const MCExpr *SymVal = LocalConstExpr;
if (!Callees.empty()) {
SmallVector<const MCExpr *, 8> ArgExprs;
// Avoid recursive symbol assignment.
- SmallSet<StringRef, 8> Seen;
- ArgExprs.push_back(localConstExpr);
- Seen.insert(MF.getName());
+ SmallPtrSet<const Function *, 8> Seen;
+ ArgExprs.push_back(LocalConstExpr);
+ const Function &F = MF.getFunction();
+ Seen.insert(&F);
for (const Function *Callee : Callees) {
- if (Seen.contains(Callee->getName()))
+ if (Seen.contains(Callee))
continue;
- Seen.insert(Callee->getName());
- MCSymbol *calleeValSym = getSymbol(Callee->getName(), RIK);
- ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+ Seen.insert(Callee);
+ MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK);
+ ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
}
SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
}
@@ -117,9 +120,9 @@ void MCResourceInfo::gatherResourceInfo(
const MachineFunction &MF,
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI) {
// Worst case VGPR use for non-hardware-entrypoints.
- MCSymbol *maxVGPRSym = getMaxVGPRSymbol();
- MCSymbol *maxAGPRSym = getMaxAGPRSymbol();
- MCSymbol *maxSGPRSym = getMaxSGPRSymbol();
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
+ MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
+ MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
addMaxVGPRCandidate(FRI.NumVGPR);
@@ -127,7 +130,7 @@ void MCResourceInfo::gatherResourceInfo(
addMaxSGPRCandidate(FRI.NumExplicitSGPR);
}
- auto setMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
+ auto SetMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
ResourceInfoKind RIK) {
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
@@ -141,9 +144,9 @@ void MCResourceInfo::gatherResourceInfo(
}
};
- setMaxReg(maxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
- setMaxReg(maxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
- setMaxReg(maxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+ SetMaxReg(MaxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
+ SetMaxReg(MaxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
+ SetMaxReg(MaxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
@@ -172,9 +175,9 @@ void MCResourceInfo::gatherResourceInfo(
->setVariableValue(localConstExpr);
}
- auto setToLocal = [&](int64_t localValue, ResourceInfoKind RIK) {
+ auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
MCSymbol *Sym = getSymbol(MF.getName(), RIK);
- Sym->setVariableValue(MCConstantExpr::create(localValue, OutContext));
+ Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
};
if (!FRI.HasIndirectCall) {
@@ -192,12 +195,12 @@ void MCResourceInfo::gatherResourceInfo(
ResourceInfoKind::RIK_HasIndirectCall,
AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
} else {
- setToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
- setToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
- setToLocal(FRI.HasDynamicallySizedStack,
+ SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+ SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+ SetToLocal(FRI.HasDynamicallySizedStack,
ResourceInfoKind::RIK_HasDynSizedStack);
- setToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
- setToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+ SetToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+ SetToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 97c1843e3b80f3..22ad1eccf4566d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -42,7 +42,7 @@ class MCResourceInfo {
int32_t MaxSGPR = 0;
MCContext &OutContext;
- bool finalized;
+ bool Finalized;
void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
AMDGPUMCExpr::VariantKind Kind,
@@ -54,7 +54,7 @@ class MCResourceInfo {
public:
MCResourceInfo(MCContext &OutContext)
- : OutContext(OutContext), finalized(false) {}
+ : OutContext(OutContext), Finalized(false) {}
void addMaxVGPRCandidate(int32_t candidate) {
MaxVGPR = std::max(MaxVGPR, candidate);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index a7ab4393f3b0ac..34efb089b72bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3025,8 +3025,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3036,7 +3036,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3061,16 +3061,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
+; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
+; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
-; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
+; GPRIDX-NEXT: wavefront_sgpr_count = 13
+; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -3116,8 +3116,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3152,16 +3152,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
+; MOVREL-NEXT: is_dynamic_callstack = 0
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
+; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
-; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
+; MOVREL-NEXT: wavefront_sgpr_count = 9
+; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -3208,8 +3208,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX10-NEXT: granulated_workitem_vgpr_count = 0
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3219,7 +3219,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3244,16 +3244,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
+; GFX10-NEXT: is_dynamic_callstack = 0
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
+; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
-; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
+; GFX10-NEXT: wavefront_sgpr_count = 9
+; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX11-NEXT: granulated_workitem_vgpr_count = 0
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3311,7 +3311,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5018)&1
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3336,16 +3336,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
+; GFX11-NEXT: is_dynamic_callstack = 0
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
+; GFX11-NEXT: workitem_private_segment_byte_size = 0
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
-; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
+; GFX11-NEXT: wavefront_sgpr_count = 7
+; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4042,8 +4042,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4053,7 +4053,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4078,16 +4078,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
+; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
+; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
-; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
+; GPRIDX-NEXT: wavefront_sgpr_count = 12
+; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4126,8 +4126,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4137,7 +4137,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4162,16 +4162,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
+; MOVREL-NEXT: is_dynamic_callstack = 0
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
+; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
-; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
+; MOVREL-NEXT: wavefront_sgpr_count = 8
+; MOVREL-NEXT: workitem_vgpr_count = 3
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4211,8 +4211,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX10-NEXT: granulated_workitem_vgpr_count = 0
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4222,7 +4222,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4247,16 +4247,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
+; GFX10-NEXT: is_dynamic_callstack = 0
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
+; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
-; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
+; GFX10-NEXT: wavefront_sgpr_count = 8
+; GFX10-NEXT: workitem_vgpr_count = 2
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4296,8 +4296,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX11-NEXT: granulated_workitem_vgpr_count = 0
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4307,7 +4307,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5018)&1
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4332,16 +4332,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
+; GFX11-NEXT: is_dynamic_callstack = 0
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
+; GFX11-NEXT: workitem_private_segment_byte_size = 0
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
-; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
+; GFX11-NEXT: wavefront_sgpr_count = 5
+; GFX11-NEXT: workitem_vgpr_count = 2
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4389,8 +4389,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4400,7 +4400,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4425,16 +4425,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
+; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
+; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
-; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
+; GPRIDX-NEXT: wavefront_sgpr_count = 13
+; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4476,8 +4476,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4487,7 +4487,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4512,16 +4512,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
+; MOVREL-NEXT: is_dynamic_callstack = 0
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
+; MOVREL-NEXT: workitem_private_segment_byte_size = 0
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
-; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
+; MOVREL-NEXT: wavefront_sgpr_count = 9
+; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4564,8 +4564,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX10-NEXT: granulated_workitem_vgpr_count = 0
+; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4575,7 +4575,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4600,16 +4600,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
+; GFX10-NEXT: is_dynamic_callstack = 0
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
+; GFX10-NEXT: workitem_private_segment_byte_size = 0
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
-; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
+; GFX10-NEXT: wavefront_sgpr_count = 9
+; GFX10-NEXT: workitem_vgpr_count = 3
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4652,8 +4652,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
-; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
+; GFX11-NEXT: granulated_workitem_vgpr_count = 0
+; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4663,7 +4663,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5018)&1
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4688,16 +4688,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
+; GFX11-NEXT: is_dynamic_callstack = 0
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
+; GFX11-NEXT: workitem_private_segment_byte_size = 0
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
-; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
+; GFX11-NEXT: wavefront_sgpr_count = 7
+; GFX11-NEXT: workitem_vgpr_count = 3
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 0221d0f790be0c..66b88236bbb4c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -18,7 +18,7 @@ target triple = "amdgcn-amd-amdhsa"
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_addrspacecast_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_addrspacecast_in_kernel_no_calls.has_dyn_sized_stack|stack_object_addrspacecast_in_kernel_no_calls.has_recursion))|128)&1
+; RO-FLAT: .amdhsa_enable_private_segment 1
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -38,12 +38,11 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|140)&1
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|128)&1
-; RW-FLAT: .amdhsa_reserve_flat_scratch stack_object_in_kernel_no_calls.uses_flat_scratch
-; RW-FLAT: .set stack_object_in_kernel_no_calls.uses_flat_scratch, 0
+; RO-FLAT: .amdhsa_enable_private_segment 1
+; RW-FLAT: .amdhsa_reserve_flat_scratch 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -59,12 +58,11 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|136)&1
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|128)&1
-; RW-FLAT: .amdhsa_reserve_flat_scratch kernel_no_calls_no_stack.uses_flat_scratch
-; RW-FLAT: .set kernel_no_calls_no_stack.uses_flat_scratch, 0
+; RO-FLAT: .amdhsa_enable_private_segment 0
+; RW-FLAT: .amdhsa_reserve_flat_scratch 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index 374ce0676d2205..d5646820a19832 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -16,7 +16,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
+; MESA-NEXT: .long 132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
+; MESA-NEXT: .long 2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
+; MESA-NEXT: .long 4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index 1c3db1d64b299d..c7afbeabbbb6b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -75,10 +75,10 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
; DEFAULTSIZE: ; ScratchSize: 16
-; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -137,10 +137,10 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
; DEFAULTSIZE: ; ScratchSize: 64
-; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index e311be4b12218a..647b5aff242984 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -2,10 +2,9 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}kernel_32_agprs:
-; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_32_agprs.num_agpr, kernel_32_agprs.num_vgpr), 1, 0)
-; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GFX908: .set kernel_32_agprs.num_vgpr, 9
-; GFX908: .set kernel_32_agprs.num_agpr, 32
+; GFX908: .amdhsa_next_free_vgpr 32
+; GFX90A: .amdhsa_next_free_vgpr 44
+; GFX90A: .amdhsa_accum_offset 12
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
@@ -25,9 +24,8 @@ bb:
}
; GCN-LABEL: {{^}}kernel_0_agprs:
-; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_0_agprs.num_agpr, kernel_0_agprs.num_vgpr), 1, 0)
-; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_0_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GCN: .set kernel_0_agprs.num_vgpr, 1
+; GCN: .amdhsa_next_free_vgpr 1
+; GFX90A: .amdhsa_accum_offset 4
; GCN: NumVgprs: 1
; GCN: NumAgprs: 0
; GCN: TotalNumVgprs: 1
@@ -44,10 +42,9 @@ bb:
}
; GCN-LABEL: {{^}}kernel_40_vgprs:
-; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_40_vgprs.num_agpr, kernel_40_vgprs.num_vgpr), 1, 0)
-; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_40_vgprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GCN: .set kernel_40_vgprs.num_vgpr, 40
-; GFX90A: .set kernel_40_vgprs.num_agpr, 16
+; GFX908: .amdhsa_next_free_vgpr 40
+; GFX90A: .amdhsa_next_free_vgpr 56
+; GFX90A: .amdhsa_accum_offset 40
; GCN: NumVgprs: 40
; GCN: NumAgprs: 16
; GFX908: TotalNumVgprs: 40
@@ -102,10 +99,9 @@ bb:
}
; GCN-LABEL: {{^}}kernel_max_gprs:
-; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_max_gprs.num_agpr, kernel_max_gprs.num_vgpr), 1, 0)
-; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_max_gprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GCN: .set kernel_max_gprs.num_vgpr, 256
-; GFX90A: .set kernel_max_gprs.num_agpr, 256
+; GFX908: .amdhsa_next_free_vgpr 256
+; GFX90A: .amdhsa_next_free_vgpr 512
+; GFX90A: .amdhsa_accum_offset 256
; GCN: NumVgprs: 256
; GCN: NumAgprs: 256
; GFX908: TotalNumVgprs: 256
@@ -125,10 +121,8 @@ bb:
}
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
-; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_func_32_agprs.num_agpr, kernel_call_func_32_agprs.num_vgpr), 1, 0)
-; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_func_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GCN: .set kernel_call_func_32_agprs.num_vgpr, max(0, func_32_agprs.num_vgpr)
-; GCN: .set kernel_call_func_32_agprs.num_agpr, max(0, func_32_agprs.num_agpr)
+; GFX908: .amdhsa_next_free_vgpr 32
+; GFX90A: .amdhsa_accum_offset 12
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 993ff4e4477d35..9ec8e425a3f55c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -35,8 +35,7 @@
; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
; by 4 bytes.
-; HSA-ALLOCA: .amdhsa_private_segment_fixed_size mova_same_clause.private_seg_size
-; HSA-ALLOCA: .set mova_same_clause.private_seg_size, 24
+; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24
; HSA-ALLOCA: s_add_i32 s12, s12, s17
; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 77be8605f20015..90562e25a3e9c1 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -10,21 +10,12 @@
; OPT: .amdhsa_user_sgpr_dispatch_id 0
; OPT: .amdhsa_user_sgpr_flat_scratch_init 0
; OPT: .amdhsa_user_sgpr_private_segment_size 0
-; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|136)&1
+; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; OPT: .amdhsa_system_sgpr_workgroup_id_x 1
; OPT: .amdhsa_system_sgpr_workgroup_id_y 0
; OPT: .amdhsa_system_sgpr_workgroup_id_z 0
; OPT: .amdhsa_system_sgpr_workgroup_info 0
; OPT: .amdhsa_system_vgpr_workitem_id 0
-; OPT: .set foo.num_vgpr, 0
-; OPT: .set foo.num_agpr, 0
-; OPT: .set foo.num_sgpr, 0
-; OPT: .set foo.private_seg_size, 0
-; OPT: .set foo.uses_vcc, 0
-; OPT: .set foo.uses_flat_scratch, 0
-; OPT: .set foo.has_dyn_sized_stack, 0
-; OPT: .set foo.has_recursion, 0
-; OPT: .set foo.has_indirect_call, 0
; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1
; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1
@@ -34,25 +25,12 @@
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
-; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5016)&1
-; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5012)&1
+; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_y 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1
-; COV4: .amdhsa_system_sgpr_workgroup_info 0
-; COV5: .amdhsa_system_sgpr_workgroup_info 0
-; COV4: .amdhsa_system_vgpr_workitem_id 2
-; COV5: .amdhsa_system_vgpr_workitem_id 2
-; NOOPT: .set foo.num_vgpr, 0
-; NOOPT: .set foo.num_agpr, 0
-; NOOPT: .set foo.num_sgpr, 0
-; NOOPT: .set foo.private_seg_size, 0
-; NOOPT: .set foo.uses_vcc, 0
-; NOOPT: .set foo.uses_flat_scratch, 0
-; NOOPT: .set foo.has_dyn_sized_stack, 0
-; NOOPT: .set foo.has_recursion, 0
-; NOOPT: .set foo.has_indirect_call, 0
-
+; NOOPT: .amdhsa_system_sgpr_workgroup_info 0
+; NOOPT: .amdhsa_system_vgpr_workitem_id 2
define amdgpu_kernel void @foo() {
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6311c2a01d366b..05a974695643d3 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -222,14 +222,10 @@ ret:
}
; GCN-LABEL: {{^}}usage_direct_recursion:
-; GCN: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
-; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(16384, direct_recursion_use_stack.private_seg_size))
-; GCN: ScratchSize: 18448
+; GCN: .amdhsa_private_segment_fixed_size 18448
;
; GCN-V5-LABEL: {{^}}usage_direct_recursion:
-; GCN-V5: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
-; GCN-V5: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
-; GCN-V5: ScratchSize: 2064
+; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 7e731a70ca4d76..032ec65fa85133 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -469,7 +469,7 @@ define hidden void @use_every_sgpr_input() #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input.has_recursion))|920)&1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
@@ -494,7 +494,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input_no_kernargs.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input_no_kernargs.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input_no_kernargs.has_recursion))|916)&1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 13fd714933dbb6..3035a8579c8a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -15,19 +15,13 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fadd.num_agpr, fadd.num_vgpr), 1, 0)
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fadd.num_sgpr+(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0))
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fadd.uses_vcc
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fadd.uses_flat_scratch
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
-; OSABI-AMDHSA-ASM: .set fadd.num_vgpr, 3
-; OSABI-AMDHSA-ASM: .set fadd.num_agpr, 0
-; OSABI-AMDHSA-ASM: .set fadd.num_sgpr, 8
-; OSABI-AMDHSA-ASM: .set fadd.uses_vcc, 0
-; OSABI-AMDHSA-ASM: .set fadd.uses_flat_scratch, 0
-
; ALL-ASM-LABEL: {{^}}fsub:
; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel
@@ -40,19 +34,13 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fsub.num_agpr, fsub.num_vgpr), 1, 0)
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fsub.num_sgpr+(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0))
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fsub.uses_vcc
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fsub.uses_flat_scratch
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
-; OSABI-AMDHSA-ASM: .set fsub.num_vgpr, 3
-; OSABI-AMDHSA-ASM: .set fsub.num_agpr, 0
-; OSABI-AMDHSA-ASM: .set fsub.num_sgpr, 8
-; OSABI-AMDHSA-ASM: .set fsub.uses_vcc, 0
-; OSABI-AMDHSA-ASM: .set fsub.uses_flat_scratch, 0
-
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_isa
@@ -105,10 +93,8 @@ entry:
; registers used.
;
; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(empty.num_agpr, empty.num_vgpr), 1, 0)
-; ALL-ASM: .amdhsa_next_free_sgpr (max(empty.num_sgpr+(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0))
-; ALL-ASM: NumSGPRsForWavesPerEU: 1
-; ALL-ASM: NumVGPRsForWavesPerEU: 1
+; ALL-ASM: .amdhsa_next_free_vgpr 1
+; ALL-ASM: .amdhsa_next_free_sgpr 1
define amdgpu_kernel void @empty(
i32 %i,
ptr addrspace(1) %r,
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 69a729f6847f0a..789150f690d52e 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -62,8 +62,7 @@
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-; VGPR: .amdhsa_private_segment_fixed_size divergent_if_endif.private_seg_size
-; VGPR: .set divergent_if_endif.private_seg_size, 16
+; VGPR: .amdhsa_private_segment_fixed_size 16
define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -134,8 +133,7 @@ endif:
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-; VGPR: .amdhsa_private_segment_fixed_size divergent_loop.private_seg_size
-; VGPR: .set divergent_loop.private_seg_size, 20
+; VGPR: .amdhsa_private_segment_fixed_size 20
define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll
index 423bb95af25df9..f51d9fc5125ba6 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=CARRIZO %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
; Test that we don't try to produce a COFF file on windows
; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
@@ -20,9 +20,8 @@
; CONFIG: .section .AMDGPU.config
; CONFIG-NEXT: .long 45096
-; TYPICAL-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 0)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
-; TONGA-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(96, 1), 8))/8)-1)&15)<<6)
-; CARRIZO-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 1)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
+; TYPICAL-NEXT: .long 0
+; TONGA-NEXT: .long 704
; CONFIG: .p2align 8
; CONFIG: test:
define amdgpu_ps void @test(i32 %p) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
index c4111282682527..78ac2f9eaff020 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
@@ -7,17 +7,13 @@
; No stack objects, only indirect call has to enable scrathch
; GCN-LABEL: test_indirect_call:
-; GCN: .amdhsa_private_segment_fixed_size test_indirect_call.private_seg_size
-; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
-; COV5: .amdhsa_uses_dynamic_stack ((59|((test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion)<<11))&2048)>>11
-; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5016)&1
-; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5020)&1
+; COV5: .amdhsa_private_segment_fixed_size 0{{$}}
+; COV4: .amdhsa_private_segment_fixed_size 16384{{$}}
-; COV5: .set test_indirect_call.private_seg_size, 0{{$}}
-; COV4: .set test_indirect_call.private_seg_size, 0+(max(16384))
-; COV5: .set test_indirect_call.has_recursion, 1
-; COV5: .set test_indirect_call.has_indirect_call, 1
+; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
+; COV5: .amdhsa_uses_dynamic_stack 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
define amdgpu_kernel void @test_indirect_call() {
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
call void %fptr()
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index d9a5a49e75f0a5..496a1c652da251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -5,14 +5,12 @@ declare void @llvm.trap() #0
; DOORBELL: .amdhsa_kernel trap
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
-; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size trap.private_seg_size
+; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_kernarg_size 8
; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12
; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; DOORBELL: .end_amdhsa_kernel
-; DOORBELL: .set trap.private_seg_size, 0
-
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 {
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.trap()
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
index f3d9e9a727c251..cdd6e88dd103b7 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -1,8 +1,6 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
; CHECK-LABEL: non_kernel_recursion:
-; CHECK: .set non_kernel_recursion.has_recursion, 1
-; CHECK: .set non_kernel_recursion.has_indirect_call, 0
define void @non_kernel_recursion(i32 %val) #2 {
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %ret, label %call
@@ -18,11 +16,8 @@ ret:
; CHECK-LABEL: kernel_caller_recursion:
; CHECK: .amd_kernel_code_t
-; CHECK: is_dynamic_callstack = kernel_caller_recursion.has_dyn_sized_stack|kernel_caller_recursion.has_recursion
+; CHECK: is_dynamic_callstack = 1
; CHECK: .end_amd_kernel_code_t
-
-; CHECK: .set kernel_caller_recursion.has_recursion, or(1, non_kernel_recursion.has_recursion)
-; CHECK: .set kernel_caller_recursion.has_indirect_call, or(0, non_kernel_recursion.has_indirect_call)
define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
call void @non_kernel_recursion(i32 %n)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 64bc14d750573b..7698372b687797 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,10 +1,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=GCN,CI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,VI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s
-; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=BON,GCNHSA,ALL %s
-; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=CAR,GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10,GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GFX11,GCNHSA,ALL %s
+; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
@@ -24,7 +24,7 @@
; GCNHSA: .amdhsa_kernel large_alloca_compute_shader
; GCNHSA: .amdhsa_group_segment_fixed_size 0
-; GCNHSA: .amdhsa_private_segment_fixed_size large_alloca_compute_shader.private_seg_size
+; GCNHSA: .amdhsa_private_segment_fixed_size 32772
; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1
; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1
; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1
@@ -32,19 +32,14 @@
; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1
; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1
; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0
-; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(large_alloca_compute_shader.private_seg_size*{{32|64}}, {{1024|256}}))/{{1024|256}})>0)||(large_alloca_compute_shader.has_dyn_sized_stack|large_alloca_compute_shader.has_recursion))|5020)&1
+; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0
; GCNHSA: .amdhsa_system_vgpr_workitem_id 2
-; GCNHSA: .amdhsa_next_free_vgpr max(totalnumvgprs(large_alloca_compute_shader.num_agpr, large_alloca_compute_shader.num_vgpr), 1, 0)
-; BON: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
-; CAR: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
-; GFX10: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
-; GFX11: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
-; GCNHSA: .amdhsa_reserve_vcc large_alloca_compute_shader.uses_vcc
-; GCNHSA: .amdhsa_reserve_flat_scratch large_alloca_compute_shader.uses_flat_scratch
+; GCNHSA: .amdhsa_next_free_vgpr 3
+; GCNHSA: .amdhsa_next_free_sgpr 18
; GCNHSA: .amdhsa_float_round_mode_32 0
; GCNHSA: .amdhsa_float_round_mode_16_64 0
; GCNHSA: .amdhsa_float_denorm_mode_32 3
@@ -60,16 +55,6 @@
; GCNHSA: .amdhsa_exception_int_div_zero 0
; GCNHSA: .end_amdhsa_kernel
-; GCNHSA: .set large_alloca_compute_shader.num_vgpr, 3
-; GCNHSA: .set large_alloca_compute_shader.num_agpr, 0
-; GCNHSA: .set large_alloca_compute_shader.num_sgpr, 18
-; GCNHSA: .set large_alloca_compute_shader.private_seg_size, 32772
-; GCNHSA: .set large_alloca_compute_shader.uses_vcc
-; GCNHSA: .set large_alloca_compute_shader.uses_flat_scratch, 0
-; GCNHSA: .set large_alloca_compute_shader.has_dyn_sized_stack, 0
-; GCNHSA: .set large_alloca_compute_shader.has_recursion, 0
-; GCNHSA: .set large_alloca_compute_shader.has_indirect_call, 0
-
; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 778060d3c5fb3d..eaee8ec73fe411 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
+; MESA-NEXT: .long 132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -29,7 +29,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
+; MESA-NEXT: .long 2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -47,7 +47,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
+; MESA-NEXT: .long 4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index f3cf2a5ca8ff62..7f0f473c11bd59 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,15 +1,14 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,ALL %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
; SPI_TMPRING_SIZE.WAVESIZE = 5
; GFX10: .long 165608
-; GFX10-NEXT: .long (((alignto(scratch_ps.private_seg_size*32, 1024))/1024)&8191)<<12
+; GFX10-NEXT: .long 20480
; SPI_TMPRING_SIZE.WAVESIZE = 17
; GFX11: .long 165608
-; 11XFG-TXEN: .long 69632
-; GFX11-NEXT:.long (((alignto(scratch_ps.private_seg_size*32, 256))/256)&32767)<<12
+; GFX11-NEXT: .long 69632
; GCN-LABEL: {{^}}scratch_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
@@ -24,5 +23,3 @@ entry:
store volatile i32 2, ptr addrspace(5) %ptr
ret void
}
-
-; ALL: .set scratch_ps.private_seg_size, 132
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index 3c100cf7a38527..b84686139d0e2c 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -110,14 +110,13 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 4112
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
; DEFAULTSIZE: ; ScratchSize: 4112
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((41|((kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_recursion)<<11))&2048)>>11
-; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 16
-; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack, 1
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 16
-; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -206,16 +205,13 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
-; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
; DEFAULTSIZE: ; ScratchSize: 4160
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((59|((kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_recursion)<<11))&2048)>>11
-; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 64
-; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack, 1
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
; DEFAULTSIZE-V5: ; ScratchSize: 64
-; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 87ec51fb44ac45..4f10e90fd087fa 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,12 +2,12 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t
; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))
-; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: test_kernel.num_vgpr
-; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: test_kernel.num_agpr
-; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: test_kernel.private_seg_size
+; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
+; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
+; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
+; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:27:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: 5
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: LDS Size [bytes/block]: 512
@@ -19,7 +19,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: 'Function Name: '
-; REMARK-NEXT: - FunctionName: test_kernel
+; REMARK-NEXT: - FunctionName: test_kernel
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -28,7 +28,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' SGPRs: '
-; REMARK-NEXT: - NumSGPR: 'test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))'
+; REMARK-NEXT: - NumSGPR: '28'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -37,7 +37,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' VGPRs: '
-; REMARK-NEXT: - NumVGPR: test_kernel.num_vgpr
+; REMARK-NEXT: - NumVGPR: '9'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -46,7 +46,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' AGPRs: '
-; REMARK-NEXT: - NumAGPR: test_kernel.num_agpr
+; REMARK-NEXT: - NumAGPR: '43'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -55,17 +55,17 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' ScratchSize [bytes/lane]: '
-; REMARK-NEXT: - ScratchSize: test_kernel.private_seg_size
-; REMARK-NEXT: ...
+; REMARK-NEXT: - ScratchSize: '0'
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: DynamicStack
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' Dynamic Stack: '
-; REMARK-NEXT: - DynamicStack: 'False'
-; REMARK-NEXT: ...
+; REMARK-NEXT: - String: ' Dynamic Stack: '
+; REMARK-NEXT: - DynamicStack: 'False'
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: Occupancy
@@ -73,7 +73,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' Occupancy [waves/SIMD]: '
-; REMARK-NEXT: - Occupancy: 'occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))'
+; REMARK-NEXT: - Occupancy: '5'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -122,12 +122,12 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1))
-; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: empty_kernel.num_vgpr
-; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: empty_kernel.num_agpr
-; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: empty_kernel.private_seg_size
+; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
+; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
+; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(empty_kernel.num_agpr, empty_kernel.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0
@@ -141,11 +141,11 @@ define void @empty_func() !dbg !8 {
}
; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: test_indirect_call.num_sgpr+6
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
-; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: test_indirect_call.private_seg_size
-; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
@@ -159,11 +159,11 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
}
; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: test_indirect_w_static_stack.num_sgpr+6
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
-; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: test_indirect_w_static_stack.private_seg_size
-; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
+; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 38d202eb4308f6..6ddf0986755f95 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel max_alignment_128
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
+; VI-NEXT: .amdhsa_private_segment_fixed_size 256
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -29,16 +29,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
-; VI-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0))
-; VI-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
-; VI-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 18
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -54,15 +54,6 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
-; VI: .set max_alignment_128.num_vgpr, 1
-; VI-NEXT: .set max_alignment_128.num_agpr, 0
-; VI-NEXT: .set max_alignment_128.num_sgpr, 18
-; VI-NEXT: .set max_alignment_128.private_seg_size, 256
-; VI-NEXT: .set max_alignment_128.uses_vcc, 0
-; VI-NEXT: .set max_alignment_128.uses_flat_scratch, 0
-; VI-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
-; VI-NEXT: .set max_alignment_128.has_recursion, 0
-; VI-NEXT: .set max_alignment_128.has_indirect_call, 0
;
; GFX9-LABEL: max_alignment_128:
; GFX9: ; %bb.0:
@@ -79,7 +70,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel max_alignment_128
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -89,16 +80,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
-; GFX9-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1))
-; GFX9-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 18
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -116,15 +107,6 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
-; GFX9: .set max_alignment_128.num_vgpr, 1
-; GFX9-NEXT: .set max_alignment_128.num_agpr, 0
-; GFX9-NEXT: .set max_alignment_128.num_sgpr, 18
-; GFX9-NEXT: .set max_alignment_128.private_seg_size, 256
-; GFX9-NEXT: .set max_alignment_128.uses_vcc, 0
-; GFX9-NEXT: .set max_alignment_128.uses_flat_scratch, 0
-; GFX9-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
-; GFX9-NEXT: .set max_alignment_128.has_recursion, 0
-; GFX9-NEXT: .set max_alignment_128.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 128, addrspace(5)
@@ -148,7 +130,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel stackrealign_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
+; VI-NEXT: .amdhsa_private_segment_fixed_size 12
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -158,16 +140,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
-; VI-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0))
-; VI-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
-; VI-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 18
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -183,15 +165,6 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
-; VI: .set stackrealign_attr.num_vgpr, 1
-; VI-NEXT: .set stackrealign_attr.num_agpr, 0
-; VI-NEXT: .set stackrealign_attr.num_sgpr, 18
-; VI-NEXT: .set stackrealign_attr.private_seg_size, 12
-; VI-NEXT: .set stackrealign_attr.uses_vcc, 0
-; VI-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
-; VI-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
-; VI-NEXT: .set stackrealign_attr.has_recursion, 0
-; VI-NEXT: .set stackrealign_attr.has_indirect_call, 0
;
; GFX9-LABEL: stackrealign_attr:
; GFX9: ; %bb.0:
@@ -208,7 +181,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -218,16 +191,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
-; GFX9-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1))
-; GFX9-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 18
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -245,15 +218,6 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
-; GFX9: .set stackrealign_attr.num_vgpr, 1
-; GFX9-NEXT: .set stackrealign_attr.num_agpr, 0
-; GFX9-NEXT: .set stackrealign_attr.num_sgpr, 18
-; GFX9-NEXT: .set stackrealign_attr.private_seg_size, 12
-; GFX9-NEXT: .set stackrealign_attr.uses_vcc, 0
-; GFX9-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
-; GFX9-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
-; GFX9-NEXT: .set stackrealign_attr.has_recursion, 0
-; GFX9-NEXT: .set stackrealign_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
@@ -277,7 +241,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel alignstack_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
+; VI-NEXT: .amdhsa_private_segment_fixed_size 128
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -287,16 +251,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
-; VI-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0))
-; VI-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
-; VI-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
+; VI-NEXT: .amdhsa_next_free_vgpr 1
+; VI-NEXT: .amdhsa_next_free_sgpr 18
+; VI-NEXT: .amdhsa_reserve_vcc 0
+; VI-NEXT: .amdhsa_reserve_flat_scratch 0
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -312,15 +276,6 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
-; VI: .set alignstack_attr.num_vgpr, 1
-; VI-NEXT: .set alignstack_attr.num_agpr, 0
-; VI-NEXT: .set alignstack_attr.num_sgpr, 18
-; VI-NEXT: .set alignstack_attr.private_seg_size, 128
-; VI-NEXT: .set alignstack_attr.uses_vcc, 0
-; VI-NEXT: .set alignstack_attr.uses_flat_scratch, 0
-; VI-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
-; VI-NEXT: .set alignstack_attr.has_recursion, 0
-; VI-NEXT: .set alignstack_attr.has_indirect_call, 0
;
; GFX9-LABEL: alignstack_attr:
; GFX9: ; %bb.0:
@@ -337,7 +292,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel alignstack_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -347,16 +302,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
-; GFX9-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1))
-; GFX9-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
+; GFX9-NEXT: .amdhsa_next_free_vgpr 1
+; GFX9-NEXT: .amdhsa_next_free_sgpr 18
+; GFX9-NEXT: .amdhsa_reserve_vcc 0
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -374,15 +329,6 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
-; GFX9: .set alignstack_attr.num_vgpr, 1
-; GFX9-NEXT: .set alignstack_attr.num_agpr, 0
-; GFX9-NEXT: .set alignstack_attr.num_sgpr, 18
-; GFX9-NEXT: .set alignstack_attr.private_seg_size, 128
-; GFX9-NEXT: .set alignstack_attr.uses_vcc, 0
-; GFX9-NEXT: .set alignstack_attr.uses_flat_scratch, 0
-; GFX9-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
-; GFX9-NEXT: .set alignstack_attr.has_recursion, 0
-; GFX9-NEXT: .set alignstack_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 432d8e0e856dbf..19d633651fdd0d 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,9 +6,8 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1
-; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index b6b30bc591e2b9..2097579e0c9959 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,9 +6,8 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0))
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 0
-; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 0aa5f2a0919761..775c62e73261a9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,9 +6,8 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
+; ASM: .amdhsa_next_free_sgpr 5
; ASM: .amdhsa_reserve_xnack_mask 1
-; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index c2845bf1035640..9bab3e6fcf8c45 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -5,32 +5,23 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t1 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t2 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: FileCheck -check-prefix=GCN %s < %t1
-; RUN: FileCheck -check-prefix=GCN %s < %t2
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; enable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t3 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t4 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t3
-; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
; disable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t5 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t6 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t5
-; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t6
-
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs %s -o %t7 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs %s -o %t8 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
-; RUN: FileCheck -check-prefix=GCN %s < %t7
-; RUN: FileCheck -check-prefix=GCN %s < %t8
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
@@ -40,11 +31,11 @@ declare void @llvm.debugtrap() #1
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5080
+; MESA-TRAP-NEXT: .long 5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5016
+; NOMESA-TRAP-NEXT: .long 5016
; GCN-LABEL: {{^}}hsa_trap:
; HSA-TRAP: s_mov_b64 s[0:1], s[6:7]
@@ -68,11 +59,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) {
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5080
+; MESA-TRAP-NEXT: .long 5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5016
+; NOMESA-TRAP-NEXT: .long 5016
; GCN-LABEL: {{^}}hsa_debugtrap:
; HSA-TRAP: s_trap 3
>From 8906f9f952876e3a4bbc6e790165f6cd77316c18 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Mon, 19 Aug 2024 07:14:20 -0700
Subject: [PATCH 05/11] Remove AMDGPUMCResourceInfo MCContext class member
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 56 ++++++++--------
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3 +-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 64 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 30 +++++----
4 files changed, 83 insertions(+), 70 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 3e830881e8395b..06db32f019cdde 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -93,7 +93,6 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
- RI = std::make_unique<MCResourceInfo>(OutContext);
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -380,7 +379,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
MCSymbol *ScratchSizeSymbol =
- RI->getSymbol(F.getName(), RIK::RIK_PrivateSegSize);
+ RI.getSymbol(F.getName(), RIK::RIK_PrivateSegSize, OutContext);
uint64_t ScratchSize;
if (ScratchSizeSymbol->isVariable() &&
TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
@@ -392,7 +391,8 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
// Validate addressable scalar registers (i.e., prior to added implicit
// SGPRs).
- MCSymbol *NumSGPRSymbol = RI->getSymbol(F.getName(), RIK::RIK_NumSGPR);
+ MCSymbol *NumSGPRSymbol =
+ RI.getSymbol(F.getName(), RIK::RIK_NumSGPR, OutContext);
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
!STM.hasSGPRInitBug()) {
unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
@@ -408,9 +408,10 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
}
}
- MCSymbol *VCCUsedSymbol = RI->getSymbol(F.getName(), RIK::RIK_UsesVCC);
+ MCSymbol *VCCUsedSymbol =
+ RI.getSymbol(F.getName(), RIK::RIK_UsesVCC, OutContext);
MCSymbol *FlatUsedSymbol =
- RI->getSymbol(F.getName(), RIK::RIK_UsesFlatScratch);
+ RI.getSymbol(F.getName(), RIK::RIK_UsesFlatScratch, OutContext);
uint64_t VCCUsed, FlatUsed, NumSgpr;
if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
@@ -471,9 +472,10 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Assign expressions which can only be resolved when all other functions are
// known.
- RI->finalize();
- getTargetStreamer()->EmitMCResourceMaximums(
- RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
+ RI.finalize(OutContext);
+ getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
+ RI.getMaxAGPRSymbol(OutContext),
+ RI.getMaxSGPRSymbol(OutContext));
for (Function &F : M.functions())
validateMCResourceInfo(F);
@@ -632,7 +634,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
ResourceUsage->getResourceInfo();
- RI->gatherResourceInfo(MF, Info);
+ RI.gatherResourceInfo(MF, Info, OutContext);
if (MFI->isModuleEntryFunction()) {
getSIProgramInfo(CurrentProgramInfo, MF);
@@ -668,15 +670,15 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
{
using RIK = MCResourceInfo::ResourceInfoKind;
getTargetStreamer()->EmitMCResourceInfo(
- RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR),
- RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR),
- RI->getSymbol(MF.getName(), RIK::RIK_NumSGPR),
- RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize),
- RI->getSymbol(MF.getName(), RIK::RIK_UsesVCC),
- RI->getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch),
- RI->getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack),
- RI->getSymbol(MF.getName(), RIK::RIK_HasRecursion),
- RI->getSymbol(MF.getName(), RIK::RIK_HasIndirectCall));
+ RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_NumSGPR, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_UsesVCC, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_HasRecursion, OutContext),
+ RI.getSymbol(MF.getName(), RIK::RIK_HasIndirectCall, OutContext));
}
if (isVerbose()) {
@@ -689,16 +691,18 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(" Function info:", false);
emitCommonFunctionComments(
- RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR)->getVariableValue(),
- STM.hasMAIInsts() ? RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR)
- ->getVariableValue()
- : nullptr,
- RI->createTotalNumVGPRs(MF, Ctx),
- RI->createTotalNumSGPRs(
+ RI.getSymbol(MF.getName(), RIK::RIK_NumVGPR, OutContext)
+ ->getVariableValue(),
+ STM.hasMAIInsts()
+ ? RI.getSymbol(MF.getName(), RIK::RIK_NumAGPR, OutContext)
+ ->getVariableValue()
+ : nullptr,
+ RI.createTotalNumVGPRs(MF, Ctx),
+ RI.createTotalNumSGPRs(
MF,
MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
Ctx),
- RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize)
+ RI.getSymbol(MF.getName(), RIK::RIK_PrivateSegSize, OutContext)
->getVariableValue(),
getFunctionCodeSize(MF), MFI);
return false;
@@ -904,7 +908,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
auto GetSymRefExpr =
[&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
- MCSymbol *Sym = RI->getSymbol(MF.getName(), RIK);
+ MCSymbol *Sym = RI.getSymbol(MF.getName(), RIK, OutContext);
return MCSymbolRefExpr::create(Sym, Ctx);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index a49ef406268d76..3532cc8dea0238 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
+#include "AMDGPUMCResourceInfo.h"
#include "SIProgramInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
@@ -41,7 +42,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
AMDGPUResourceUsageAnalysis *ResourceUsage;
- std::unique_ptr<MCResourceInfo> RI;
+ MCResourceInfo RI;
SIProgramInfo CurrentProgramInfo;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 0ae0907f99ec5a..b6088b4f2253f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -21,8 +21,9 @@
using namespace llvm;
-MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
- auto GOCS = [this, FuncName](StringRef Suffix) {
+MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
+ MCContext &OutContext) {
+ auto GOCS = [this, FuncName, &OutContext](StringRef Suffix) {
return OutContext.getOrCreateSymbol(FuncName + Twine(Suffix));
};
switch (RIK) {
@@ -51,16 +52,16 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
ResourceInfoKind RIK,
MCContext &Ctx) {
- return MCSymbolRefExpr::create(getSymbol(FuncName, RIK), Ctx);
+ return MCSymbolRefExpr::create(getSymbol(FuncName, RIK, Ctx), Ctx);
}
-void MCResourceInfo::assignMaxRegs() {
+void MCResourceInfo::assignMaxRegs(MCContext &OutContext) {
// Assign expression to get the max register use to the max_num_Xgpr symbol.
- MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
- MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
- MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
+ MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
+ MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
- auto assignMaxRegSym = [this](MCSymbol *Sym, int32_t RegCount) {
+ auto assignMaxRegSym = [this, &OutContext](MCSymbol *Sym, int32_t RegCount) {
const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
Sym->setVariableValue(MaxExpr);
};
@@ -70,28 +71,28 @@ void MCResourceInfo::assignMaxRegs() {
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
}
-void MCResourceInfo::finalize() {
+void MCResourceInfo::finalize(MCContext &OutContext) {
assert(!Finalized && "Cannot finalize ResourceInfo again.");
Finalized = true;
- assignMaxRegs();
+ assignMaxRegs(OutContext);
}
-MCSymbol *MCResourceInfo::getMaxVGPRSymbol() {
+MCSymbol *MCResourceInfo::getMaxVGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("max_num_vgpr");
}
-MCSymbol *MCResourceInfo::getMaxAGPRSymbol() {
+MCSymbol *MCResourceInfo::getMaxAGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("max_num_agpr");
}
-MCSymbol *MCResourceInfo::getMaxSGPRSymbol() {
+MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
return OutContext.getOrCreateSymbol("max_num_sgpr");
}
void MCResourceInfo::assignResourceInfoExpr(
int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
- const MachineFunction &MF,
- const SmallVectorImpl<const Function *> &Callees) {
+ const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
+ MCContext &OutContext) {
const MCConstantExpr *LocalConstExpr =
MCConstantExpr::create(LocalValue, OutContext);
const MCExpr *SymVal = LocalConstExpr;
@@ -107,22 +108,23 @@ void MCResourceInfo::assignResourceInfoExpr(
if (Seen.contains(Callee))
continue;
Seen.insert(Callee);
- MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK);
+ MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext);
ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
}
SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
}
- MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
Sym->setVariableValue(SymVal);
}
void MCResourceInfo::gatherResourceInfo(
const MachineFunction &MF,
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI) {
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
+ MCContext &OutContext) {
// Worst case VGPR use for non-hardware-entrypoints.
- MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
- MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
- MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol(OutContext);
+ MCSymbol *MaxAGPRSym = getMaxAGPRSymbol(OutContext);
+ MCSymbol *MaxSGPRSym = getMaxSGPRSymbol(OutContext);
if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
addMaxVGPRCandidate(FRI.NumVGPR);
@@ -134,10 +136,10 @@ void MCResourceInfo::gatherResourceInfo(
ResourceInfoKind RIK) {
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
- FRI.Callees);
+ FRI.Callees, OutContext);
} else {
const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
- MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK);
+ MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK, OutContext);
const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
{MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
LocalNumSym->setVariableValue(MaxWithLocal);
@@ -159,7 +161,7 @@ void MCResourceInfo::gatherResourceInfo(
if (!FRI.HasIndirectCall) {
for (const Function *Callee : FRI.Callees) {
MCSymbol *calleeValSym =
- getSymbol(Callee->getName(), RIK_PrivateSegSize);
+ getSymbol(Callee->getName(), RIK_PrivateSegSize, OutContext);
ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
}
}
@@ -171,29 +173,29 @@ void MCResourceInfo::gatherResourceInfo(
localConstExpr =
MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
}
- getSymbol(MF.getName(), RIK_PrivateSegSize)
+ getSymbol(MF.getName(), RIK_PrivateSegSize, OutContext)
->setVariableValue(localConstExpr);
}
auto SetToLocal = [&](int64_t LocalValue, ResourceInfoKind RIK) {
- MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK, OutContext);
Sym->setVariableValue(MCConstantExpr::create(LocalValue, OutContext));
};
if (!FRI.HasIndirectCall) {
assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
- AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.UsesFlatScratch,
ResourceInfoKind::RIK_UsesFlatScratch,
- AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
ResourceInfoKind::RIK_HasDynSizedStack,
- AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
- AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
assignResourceInfoExpr(FRI.HasIndirectCall,
ResourceInfoKind::RIK_HasIndirectCall,
- AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees, OutContext);
} else {
SetToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
SetToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 22ad1eccf4566d..08e862c9953367 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -12,6 +12,9 @@
///
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
+
#include "AMDGPUResourceUsageAnalysis.h"
#include "MCTargetDesc/AMDGPUMCExpr.h"
@@ -41,20 +44,19 @@ class MCResourceInfo {
int32_t MaxAGPR = 0;
int32_t MaxSGPR = 0;
- MCContext &OutContext;
- bool Finalized;
+ bool Finalized = false;
void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
AMDGPUMCExpr::VariantKind Kind,
const MachineFunction &MF,
- const SmallVectorImpl<const Function *> &Callees);
+ const SmallVectorImpl<const Function *> &Callees,
+ MCContext &OutContext);
// Assigns expression for Max S/V/A-GPRs to the referenced symbols.
- void assignMaxRegs();
+ void assignMaxRegs(MCContext &OutContext);
public:
- MCResourceInfo(MCContext &OutContext)
- : OutContext(OutContext), Finalized(false) {}
+ MCResourceInfo() = default;
void addMaxVGPRCandidate(int32_t candidate) {
MaxVGPR = std::max(MaxVGPR, candidate);
}
@@ -65,17 +67,18 @@ class MCResourceInfo {
MaxSGPR = std::max(MaxSGPR, candidate);
}
- MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK);
+ MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK,
+ MCContext &OutContext);
const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
MCContext &Ctx);
// Resolves the final symbols that requires the inter-function resource info
// to be resolved.
- void finalize();
+ void finalize(MCContext &OutContext);
- MCSymbol *getMaxVGPRSymbol();
- MCSymbol *getMaxAGPRSymbol();
- MCSymbol *getMaxSGPRSymbol();
+ MCSymbol *getMaxVGPRSymbol(MCContext &OutContext);
+ MCSymbol *getMaxAGPRSymbol(MCContext &OutContext);
+ MCSymbol *getMaxSGPRSymbol(MCContext &OutContext);
/// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
/// granularity. However, some resource info has to be assigned the call
@@ -84,10 +87,13 @@ class MCResourceInfo {
/// functions with indirect calls should be assigned the module level maximum.
void gatherResourceInfo(
const MachineFunction &MF,
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI);
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI,
+ MCContext &OutContext);
const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
MCContext &Ctx);
};
} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
\ No newline at end of file
>From adab4b2a2aee420d0f199b0ad6c4b852719326f8 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Tue, 20 Aug 2024 05:38:09 -0700
Subject: [PATCH 06/11] Restore clang test, add newline
---
clang/test/Frontend/amdgcn-machine-analysis-remarks.cl | 10 +++++-----
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 5 ++++-
2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a2dd59a871904c..a05e21b37b9127 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,12 +2,12 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
// expected-remark at +10 {{Function Name: foo}}
-// expected-remark at +9 {{ SGPRs: foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1))}}
-// expected-remark at +8 {{ VGPRs: foo.num_vgpr}}
-// expected-remark at +7 {{ AGPRs: foo.num_agpr}}
-// expected-remark at +6 {{ ScratchSize [bytes/lane]: foo.private_seg_size}}
+// expected-remark at +9 {{ SGPRs: 13}}
+// expected-remark at +8 {{ VGPRs: 10}}
+// expected-remark at +7 {{ AGPRs: 12}}
+// expected-remark at +6 {{ ScratchSize [bytes/lane]: 0}}
// expected-remark at +5 {{ Dynamic Stack: False}}
-// expected-remark at +4 {{ Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(foo.num_agpr, foo.num_vgpr), 1, 0))}}
+// expected-remark at +4 {{ Occupancy [waves/SIMD]: 10}}
// expected-remark at +3 {{ SGPRs Spill: 0}}
// expected-remark at +2 {{ VGPRs Spill: 0}}
// expected-remark at +1 {{ LDS Size [bytes/block]: 0}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 08e862c9953367..08c0c106d5aa9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -44,6 +44,9 @@ class MCResourceInfo {
int32_t MaxAGPR = 0;
int32_t MaxSGPR = 0;
+ // Whether finalize(MCContext &) has already been called on this
+ // MCResourceInfo. finalize should be called exactly once, at the end of
+ // AsmPrinting, to assign the MaxXGPR symbols their final values.
bool Finalized = false;
void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
@@ -96,4 +99,4 @@ class MCResourceInfo {
};
} // namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
\ No newline at end of file
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCRESOURCEINFO_H
>From 7a4f4df84682e3adf24ab1eee27018f79dca3db2 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Wed, 21 Aug 2024 07:58:53 -0700
Subject: [PATCH 07/11] Restore gfx11-user-sgpr-init16-bug test
---
.../CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 4f300e2282426e..0f951e89d37c8a 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -36,8 +36,8 @@
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack
-; GCN-NEXT: .amdhsa_enable_private_segment
+; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
+; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -65,8 +65,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack
-; GCN-NEXT: .amdhsa_enable_private_segment
+; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
+; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -98,8 +98,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack
-; GCN-NEXT: .amdhsa_enable_private_segment
+; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
+; GCN-NEXT: .amdhsa_enable_private_segment 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -145,8 +145,8 @@ define amdgpu_kernel void @queue_ptr() #1 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack
-; GCN-NEXT: .amdhsa_enable_private_segment
+; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
+; GCN-NEXT: .amdhsa_enable_private_segment 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
>From aef86a786aab7727f9be5508128860ffefdc6241 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <Janek.vanOirschot at amd.com>
Date: Thu, 22 Aug 2024 08:42:18 -0700
Subject: [PATCH 08/11] Rename comments, remarks, and emitted symbols for SGPRs
to be more verbose about what's emitted
---
.../amdgcn-machine-analysis-remarks.cl | 2 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 5 +-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 2 +-
.../CodeGen/AMDGPU/agpr-register-count.ll | 4 +-
.../AMDGPU/call-alias-register-usage-agpr.ll | 6 +-
.../AMDGPU/call-alias-register-usage0.ll | 4 +-
.../AMDGPU/call-alias-register-usage1.ll | 6 +-
.../AMDGPU/call-alias-register-usage2.ll | 6 +-
.../AMDGPU/call-alias-register-usage3.ll | 6 +-
.../AMDGPU/call-graph-register-usage.ll | 56 ++++-----
.../AMDGPU/codegen-internal-only-func.ll | 4 +-
llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 70 +++++------
.../CodeGen/AMDGPU/function-resource-usage.ll | 112 +++++++++---------
llvm/test/CodeGen/AMDGPU/ipra.ll | 8 +-
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 2 +-
.../CodeGen/AMDGPU/register-count-comments.ll | 2 +-
.../AMDGPU/resource-optimization-remarks.ll | 14 +--
17 files changed, 154 insertions(+), 155 deletions(-)
diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b9127..f15130d5f8b611 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,7 +2,7 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
// expected-remark at +10 {{Function Name: foo}}
-// expected-remark at +9 {{ SGPRs: 13}}
+// expected-remark at +9 {{ TotalSGPRs: 13}}
// expected-remark at +8 {{ VGPRs: 10}}
// expected-remark at +7 {{ AGPRs: 12}}
// expected-remark at +6 {{ ScratchSize [bytes/lane]: 0}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 06db32f019cdde..ea20d9313a2c7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -499,7 +499,8 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
const AMDGPUMachineFunction *MFI) {
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
- OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
+ OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
+ false);
OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
if (NumAGPR && TotalNumVGPR) {
OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
@@ -1649,7 +1650,7 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
// printing multiple diagnostic location and diag opts.
EmitResourceUsageRemark("FunctionName", "Function Name",
MF.getFunction().getName());
- EmitResourceUsageRemark("NumSGPR", "SGPRs",
+ EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
getMCExprStr(CurrentProgramInfo.NumSGPR));
EmitResourceUsageRemark("NumVGPR", "VGPRs",
getMCExprStr(CurrentProgramInfo.NumArchVGPR));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index b6088b4f2253f4..785401739de21b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -32,7 +32,7 @@ MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK,
case RIK_NumAGPR:
return GOCS(".num_agpr");
case RIK_NumSGPR:
- return GOCS(".num_sgpr");
+ return GOCS(".numbered_sgpr");
case RIK_PrivateSegSize:
return GOCS(".private_seg_size");
case RIK_UsesVCC:
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 647b5aff242984..2615ca33554ac6 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -165,8 +165,8 @@ declare void @undef_func()
; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4
-; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
-; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.numbered_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index f5d45993742814..e8898d6a7001cc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -9,12 +9,12 @@
; ALL-LABEL: {{^}}kernel:
; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0)
-; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.num_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
+; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.numbered_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr)
; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr)
-; ALL-NEXT: .set kernel.num_sgpr, max(33, aliasee_default.num_sgpr)
+; ALL-NEXT: .set kernel.numbered_sgpr, max(33, aliasee_default.numbered_sgpr)
define amdgpu_kernel void @kernel() #0 {
bb:
call void @alias() #2
@@ -28,7 +28,7 @@ bb:
}
; ALL: .set aliasee_default.num_vgpr, 0
; ALL-NEXT: .set aliasee_default.num_agpr, 27
-; ALL-NEXT: .set aliasee_default.num_sgpr, 32
+; ALL-NEXT: .set aliasee_default.numbered_sgpr, 32
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 092e734ef106be..a01268625cedbd 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -9,7 +9,7 @@
; CHECK-LABEL: {{^}}kernel0:
; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr)
-; CHECK-NEXT: .set kernel0.num_sgpr, max(33, aliasee_default_vgpr64_sgpr102.num_sgpr)
+; CHECK-NEXT: .set kernel0.numbered_sgpr, max(33, aliasee_default_vgpr64_sgpr102.numbered_sgpr)
define amdgpu_kernel void @kernel0() #0 {
bb:
call void @alias0() #2
@@ -18,7 +18,7 @@ bb:
; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_sgpr, 32
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index f8287dc518421e..86defe3ba7ec08 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -10,11 +10,11 @@
; CHECK-LABEL: {{^}}kernel1:
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0)
-; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.num_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.numbered_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr)
; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr)
-; CHECK-NEXT: .set kernel1.num_sgpr, max(33, aliasee_vgpr32_sgpr76.num_sgpr)
+; CHECK-NEXT: .set kernel1.numbered_sgpr, max(33, aliasee_vgpr32_sgpr76.numbered_sgpr)
define amdgpu_kernel void @kernel1() #0 {
bb:
call void asm sideeffect "; clobber v40 ", "~{v40}"()
@@ -24,7 +24,7 @@ bb:
; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27
; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_sgpr, 32
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.numbered_sgpr, 32
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index a99b2295dfe85c..6b1fbd9b6e16a2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -8,11 +8,11 @@
; CHECK-LABEL: {{^}}kernel2:
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0)
-; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.num_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.numbered_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr)
-; CHECK-NEXT: .set kernel2.num_sgpr, max(33, aliasee_vgpr64_sgpr102.num_sgpr)
+; CHECK-NEXT: .set kernel2.numbered_sgpr, max(33, aliasee_vgpr64_sgpr102.numbered_sgpr)
define amdgpu_kernel void @kernel2() #0 {
bb:
call void @alias2() #2
@@ -21,7 +21,7 @@ bb:
; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53
; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_sgpr, 32
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.numbered_sgpr, 32
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 793dc1bc3a6f33..c81181cd826677 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -8,11 +8,11 @@
; CHECK-LABEL: {{^}}kernel3:
; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0)
-; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.num_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.numbered_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr)
; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr)
-; CHECK-NEXT: .set kernel3.num_sgpr, max(33, aliasee_vgpr256_sgpr102.num_sgpr)
+; CHECK-NEXT: .set kernel3.numbered_sgpr, max(33, aliasee_vgpr256_sgpr102.numbered_sgpr)
define amdgpu_kernel void @kernel3() #0 {
bb:
call void @alias3() #2
@@ -21,7 +21,7 @@ bb:
; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253
; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_sgpr, 33
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.numbered_sgpr, 33
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 05a974695643d3..0328066bd33d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
; Make sure to run a GPU with the SGPR allocation bug.
; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; NumSgprs: 34
+; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 0
define void @use_vcc() #1 {
call void asm sideeffect "", "~{vcc}" () #0
@@ -25,7 +25,7 @@ define void @use_vcc() #1 {
; GCN: v_readlane_b32 s4, v40, 2
; GCN: s_mov_b32 s33, s4
; GCN: s_setpc_b64 s[30:31]
-; GCN: ; NumSgprs: 36
+; GCN: ; TotalNumSgprs: 36
; GCN: ; NumVgprs: 41
define void @indirect_use_vcc() #1 {
call void @use_vcc()
@@ -33,9 +33,9 @@ define void @indirect_use_vcc() #1 {
}
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 38
-; VI-NOBUG: ; NumSgprs: 40
-; VI-BUG: ; NumSgprs: 96
+; CI: ; TotalNumSgprs: 38
+; VI-NOBUG: ; TotalNumSgprs: 40
+; VI-BUG: ; TotalNumSgprs: 96
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_vcc()
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
}
; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; NumSgprs: 36
-; VI: ; NumSgprs: 38
+; CI: ; TotalNumSgprs: 36
+; VI: ; TotalNumSgprs: 38
; GCN: ; NumVgprs: 0
define void @use_flat_scratch() #1 {
call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -52,8 +52,8 @@ define void @use_flat_scratch() #1 {
}
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
-; CI: ; NumSgprs: 38
-; VI: ; NumSgprs: 40
+; CI: ; TotalNumSgprs: 38
+; VI: ; TotalNumSgprs: 40
; GCN: ; NumVgprs: 41
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
@@ -61,9 +61,9 @@ define void @indirect_use_flat_scratch() #1 {
}
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
-; CI: ; NumSgprs: 38
-; VI-NOBUG: ; NumSgprs: 40
-; VI-BUG: ; NumSgprs: 96
+; CI: ; TotalNumSgprs: 38
+; VI-NOBUG: ; TotalNumSgprs: 40
+; VI-BUG: ; TotalNumSgprs: 96
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
call void @indirect_use_flat_scratch()
@@ -107,23 +107,23 @@ define void @indirect_use_50_vgpr() #0 {
}
; GCN-LABEL: {{^}}use_80_sgpr:
-; GCN: ; NumSgprs: 80
+; GCN: ; TotalNumSgprs: 80
define void @use_80_sgpr() #1 {
call void asm sideeffect "", "~{s79}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
-; GCN: ; NumSgprs: 82
+; GCN: ; TotalNumSgprs: 82
define void @indirect_use_80_sgpr() #1 {
call void @use_80_sgpr()
ret void
}
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 84
-; VI-NOBUG: ; NumSgprs: 86
-; VI-BUG: ; NumSgprs: 96
+; CI: ; TotalNumSgprs: 84
+; VI-NOBUG: ; TotalNumSgprs: 86
+; VI-BUG: ; TotalNumSgprs: 96
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
call void @indirect_use_80_sgpr()
ret void
@@ -176,7 +176,7 @@ define amdgpu_kernel void @multi_call_use_use_stack() #0 {
declare void @external() #0
; GCN-LABEL: {{^}}usage_external:
-; NumSgprs: 48
+; TotalNumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
@@ -190,7 +190,7 @@ define amdgpu_kernel void @usage_external() #0 {
declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
-; NumSgprs: 48
+; TotalNumSgprs: 48
; NumVgprs: 24
; GCN: ScratchSize: 16384
;
@@ -235,9 +235,9 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, max_num_vgpr)
-; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
-; CI: NumSgprs: count_use_sgpr96_external_call.num_sgpr+4
-; VI-BUG: NumSgprs: 96
+; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
+; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
@@ -250,9 +250,9 @@ entry:
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, max_num_vgpr)
-; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
-; CI: NumSgprs: count_use_sgpr160_external_call.num_sgpr+4
-; VI-BUG: NumSgprs: 96
+; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
+; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
@@ -265,9 +265,9 @@ entry:
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, max_num_vgpr)
-; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
-; CI: NumSgprs: count_use_vgpr160_external_call.num_sgpr+4
-; VI-BUG: NumSgprs: 96
+; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; CI: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+4
+; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
index aa1a93cfec3d86..f198833059572b 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
@@ -1,6 +1,6 @@
; REQUIRES: asserts
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
; CHECK-NOT: func
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index e4ffedd686ac93..02eb1ad9453291 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -25,11 +25,11 @@
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; NumSgprs: 8
-; VI-NOXNACK: ; NumSgprs: 8
-; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; CI: ; TotalNumSgprs: 8
+; VI-NOXNACK: ; TotalNumSgprs: 8
+; VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@@ -42,11 +42,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; NumSgprs: 10
-; VI-NOXNACK: ; NumSgprs: 10
-; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; CI: ; TotalNumSgprs: 10
+; VI-NOXNACK: ; TotalNumSgprs: 10
+; VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -59,11 +59,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; NumSgprs: 12
-; VI-NOXNACK: ; NumSgprs: 14
-; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -76,11 +76,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: ; NumSgprs: 12
-; VI-NOXNACK: ; NumSgprs: 14
-; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -96,11 +96,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
-; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; CI: TotalNumSgprs: 4
+; VI-NOXNACK: TotalNumSgprs: 6
+; VI-XNACK: TotalNumSgprs: 6
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -113,11 +113,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
-; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; CI: TotalNumSgprs: 4
+; VI-NOXNACK: TotalNumSgprs: 6
+; VI-XNACK: TotalNumSgprs: 6
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -130,11 +130,11 @@ entry:
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
-; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; CI: TotalNumSgprs: 4
+; VI-NOXNACK: TotalNumSgprs: 6
+; VI-XNACK: TotalNumSgprs: 6
+; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index c411323a70ed31..ae6a6575d62ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -1,20 +1,18 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; SGPR use may not seem equal to the sgpr use provided in comments as the latter includes extra sgprs (e.g., for vcc use).
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefix=GCN %s
; Functions that don't make calls should have constants as its resource usage as no resource information has to be propagated.
; GCN-LABEL: {{^}}use_vcc:
; GCN: .set use_vcc.num_vgpr, 0
; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.num_sgpr, 32
+; GCN: .set use_vcc.numbered_sgpr, 32
; GCN: .set use_vcc.private_seg_size, 0
; GCN: .set use_vcc.uses_vcc, 1
; GCN: .set use_vcc.uses_flat_scratch, 0
; GCN: .set use_vcc.has_dyn_sized_stack, 0
; GCN: .set use_vcc.has_recursion, 0
; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: NumSgprs: 36
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_vcc() #1 {
@@ -25,14 +23,14 @@ define void @use_vcc() #1 {
; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr)
; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr)
-; GCN: .set indirect_use_vcc.num_sgpr, max(34, use_vcc.num_sgpr)
+; GCN: .set indirect_use_vcc.numbered_sgpr, max(34, use_vcc.numbered_sgpr)
; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size))
; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc)
; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch)
; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack)
; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion)
; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call)
-; GCN: NumSgprs: 38
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_vcc() #1 {
@@ -43,14 +41,14 @@ define void @indirect_use_vcc() #1 {
; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr)
; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr)
-; GCN: .set indirect_2level_use_vcc_kernel.num_sgpr, max(33, indirect_use_vcc.num_sgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.numbered_sgpr, max(33, indirect_use_vcc.numbered_sgpr)
; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size))
; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc)
; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch)
; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack)
; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion)
; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call)
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
@@ -61,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
; GCN-LABEL: {{^}}use_flat_scratch:
; GCN: .set use_flat_scratch.num_vgpr, 0
; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.num_sgpr, 32
+; GCN: .set use_flat_scratch.numbered_sgpr, 32
; GCN: .set use_flat_scratch.private_seg_size, 0
; GCN: .set use_flat_scratch.uses_vcc, 0
; GCN: .set use_flat_scratch.uses_flat_scratch, 1
; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
; GCN: .set use_flat_scratch.has_recursion, 0
; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: NumSgprs: 38
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 0
; GCN: ScratchSize: 0
define void @use_flat_scratch() #1 {
@@ -79,14 +77,14 @@ define void @use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr)
; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr)
-; GCN: .set indirect_use_flat_scratch.num_sgpr, max(34, use_flat_scratch.num_sgpr)
+; GCN: .set indirect_use_flat_scratch.numbered_sgpr, max(34, use_flat_scratch.numbered_sgpr)
; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size))
; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc)
; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch)
; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack)
; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion)
; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call)
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_flat_scratch() #1 {
@@ -97,14 +95,14 @@ define void @indirect_use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr)
; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr)
-; GCN: .set indirect_2level_use_flat_scratch_kernel.num_sgpr, max(33, indirect_use_flat_scratch.num_sgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.numbered_sgpr, max(33, indirect_use_flat_scratch.numbered_sgpr)
; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size))
; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc)
; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion)
; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call)
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
@@ -115,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
; GCN-LABEL: {{^}}use_10_vgpr:
; GCN: .set use_10_vgpr.num_vgpr, 10
; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.num_sgpr, 32
+; GCN: .set use_10_vgpr.numbered_sgpr, 32
; GCN: .set use_10_vgpr.private_seg_size, 0
; GCN: .set use_10_vgpr.uses_vcc, 0
; GCN: .set use_10_vgpr.uses_flat_scratch, 0
; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_10_vgpr.has_recursion, 0
; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: NumSgprs: 36
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 10
; GCN: ScratchSize: 0
define void @use_10_vgpr() #1 {
@@ -134,14 +132,14 @@ define void @use_10_vgpr() #1 {
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr)
; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr)
-; GCN: .set indirect_use_10_vgpr.num_sgpr, max(34, use_10_vgpr.num_sgpr)
+; GCN: .set indirect_use_10_vgpr.numbered_sgpr, max(34, use_10_vgpr.numbered_sgpr)
; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size))
; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc)
; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch)
; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion)
; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call)
-; GCN: NumSgprs: 38
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define void @indirect_use_10_vgpr() #0 {
@@ -152,14 +150,14 @@ define void @indirect_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr)
; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr)
-; GCN: .set indirect_2_level_use_10_vgpr.num_sgpr, max(33, indirect_use_10_vgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_10_vgpr.numbered_sgpr, max(33, indirect_use_10_vgpr.numbered_sgpr)
; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size))
; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc)
; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch)
; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion)
; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call)
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 16
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
@@ -170,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: .set use_50_vgpr.num_vgpr, 50
; GCN: .set use_50_vgpr.num_agpr, 0
-; GCN: .set use_50_vgpr.num_sgpr, 32
+; GCN: .set use_50_vgpr.numbered_sgpr, 32
; GCN: .set use_50_vgpr.private_seg_size, 0
; GCN: .set use_50_vgpr.uses_vcc, 0
; GCN: .set use_50_vgpr.uses_flat_scratch, 0
; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
; GCN: .set use_50_vgpr.has_recursion, 0
; GCN: .set use_50_vgpr.has_indirect_call, 0
-; GCN: NumSgprs: 36
+; GCN: TotalNumSgprs: 36
; GCN: NumVgprs: 50
; GCN: ScratchSize: 0
define void @use_50_vgpr() #1 {
@@ -188,14 +186,14 @@ define void @use_50_vgpr() #1 {
; GCN-LABEL: {{^}}indirect_use_50_vgpr:
; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr)
; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr)
-; GCN: .set indirect_use_50_vgpr.num_sgpr, max(34, use_50_vgpr.num_sgpr)
+; GCN: .set indirect_use_50_vgpr.numbered_sgpr, max(34, use_50_vgpr.numbered_sgpr)
; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size))
; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion)
; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call)
-; GCN: NumSgprs: 38
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 50
; GCN: ScratchSize: 16
define void @indirect_use_50_vgpr() #0 {
@@ -206,14 +204,14 @@ define void @indirect_use_50_vgpr() #0 {
; GCN-LABEL: {{^}}use_80_sgpr:
; GCN: .set use_80_sgpr.num_vgpr, 1
; GCN: .set use_80_sgpr.num_agpr, 0
-; GCN: .set use_80_sgpr.num_sgpr, 80
+; GCN: .set use_80_sgpr.numbered_sgpr, 80
; GCN: .set use_80_sgpr.private_seg_size, 8
; GCN: .set use_80_sgpr.uses_vcc, 0
; GCN: .set use_80_sgpr.uses_flat_scratch, 0
; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0
; GCN: .set use_80_sgpr.has_recursion, 0
; GCN: .set use_80_sgpr.has_indirect_call, 0
-; GCN: NumSgprs: 84
+; GCN: TotalNumSgprs: 84
; GCN: NumVgprs: 1
; GCN: ScratchSize: 8
define void @use_80_sgpr() #1 {
@@ -224,14 +222,14 @@ define void @use_80_sgpr() #1 {
; GCN-LABEL: {{^}}indirect_use_80_sgpr:
; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
-; GCN: .set indirect_use_80_sgpr.num_sgpr, max(34, use_80_sgpr.num_sgpr)
+; GCN: .set indirect_use_80_sgpr.numbered_sgpr, max(34, use_80_sgpr.numbered_sgpr)
; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion)
; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
-; GCN: NumSgprs: 84
+; GCN: TotalNumSgprs: 84
; GCN: NumVgprs: 41
; GCN: ScratchSize: 24
define void @indirect_use_80_sgpr() #1 {
@@ -242,14 +240,14 @@ define void @indirect_use_80_sgpr() #1 {
; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
-; GCN: .set indirect_2_level_use_80_sgpr.num_sgpr, max(33, indirect_use_80_sgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_80_sgpr.numbered_sgpr, max(33, indirect_use_80_sgpr.numbered_sgpr)
; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion)
; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
-; GCN: NumSgprs: 86
+; GCN: TotalNumSgprs: 86
; GCN: NumVgprs: 41
; GCN: ScratchSize: 24
define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
@@ -260,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
; GCN-LABEL: {{^}}use_stack0:
; GCN: .set use_stack0.num_vgpr, 1
; GCN: .set use_stack0.num_agpr, 0
-; GCN: .set use_stack0.num_sgpr, 33
+; GCN: .set use_stack0.numbered_sgpr, 33
; GCN: .set use_stack0.private_seg_size, 2052
; GCN: .set use_stack0.uses_vcc, 0
; GCN: .set use_stack0.uses_flat_scratch, 0
; GCN: .set use_stack0.has_dyn_sized_stack, 0
; GCN: .set use_stack0.has_recursion, 0
; GCN: .set use_stack0.has_indirect_call, 0
-; GCN: NumSgprs: 37
+; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 2052
define void @use_stack0() #1 {
@@ -279,14 +277,14 @@ define void @use_stack0() #1 {
; GCN-LABEL: {{^}}use_stack1:
; GCN: .set use_stack1.num_vgpr, 1
; GCN: .set use_stack1.num_agpr, 0
-; GCN: .set use_stack1.num_sgpr, 33
+; GCN: .set use_stack1.numbered_sgpr, 33
; GCN: .set use_stack1.private_seg_size, 404
; GCN: .set use_stack1.uses_vcc, 0
; GCN: .set use_stack1.uses_flat_scratch, 0
; GCN: .set use_stack1.has_dyn_sized_stack, 0
; GCN: .set use_stack1.has_recursion, 0
; GCN: .set use_stack1.has_indirect_call, 0
-; GCN: NumSgprs: 37
+; GCN: TotalNumSgprs: 37
; GCN: NumVgprs: 1
; GCN: ScratchSize: 404
define void @use_stack1() #1 {
@@ -298,14 +296,14 @@ define void @use_stack1() #1 {
; GCN-LABEL: {{^}}indirect_use_stack:
; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
-; GCN: .set indirect_use_stack.num_sgpr, max(34, use_stack0.num_sgpr)
+; GCN: .set indirect_use_stack.numbered_sgpr, max(34, use_stack0.numbered_sgpr)
; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion)
; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call)
-; GCN: NumSgprs: 38
+; GCN: TotalNumSgprs: 38
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2132
define void @indirect_use_stack() #1 {
@@ -318,14 +316,14 @@ define void @indirect_use_stack() #1 {
; GCN-LABEL: {{^}}indirect_2_level_use_stack:
; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
-; GCN: .set indirect_2_level_use_stack.num_sgpr, max(33, indirect_use_stack.num_sgpr)
+; GCN: .set indirect_2_level_use_stack.numbered_sgpr, max(33, indirect_use_stack.numbered_sgpr)
; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion)
; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call)
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2132
define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
@@ -338,14 +336,14 @@ define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
; GCN-LABEL: {{^}}multi_call_use_use_stack:
; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
-; GCN: .set multi_call_use_use_stack.num_sgpr, max(42, use_stack0.num_sgpr, use_stack1.num_sgpr)
+; GCN: .set multi_call_use_use_stack.numbered_sgpr, max(42, use_stack0.numbered_sgpr, use_stack1.numbered_sgpr)
; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
-; GCN: NumSgprs: 48
+; GCN: TotalNumSgprs: 48
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2052
define amdgpu_kernel void @multi_call_use_use_stack() #0 {
@@ -359,14 +357,14 @@ declare void @external() #0
; GCN-LABEL: {{^}}multi_call_with_external:
; GCN: .set multi_call_with_external.num_vgpr, max(41, max_num_vgpr)
; GCN: .set multi_call_with_external.num_agpr, max(0, max_num_agpr)
-; GCN: .set multi_call_with_external.num_sgpr, max(42, max_num_sgpr)
+; GCN: .set multi_call_with_external.numbered_sgpr, max(42, max_num_sgpr)
; GCN: .set multi_call_with_external.private_seg_size, 0
; GCN: .set multi_call_with_external.uses_vcc, 1
; GCN: .set multi_call_with_external.uses_flat_scratch, 1
; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1
; GCN: .set multi_call_with_external.has_recursion, 0
; GCN: .set multi_call_with_external.has_indirect_call, 1
-; GCN: NumSgprs: multi_call_with_external.num_sgpr+6
+; GCN: TotalNumSgprs: multi_call_with_external.numbered_sgpr+6
; GCN: NumVgprs: multi_call_with_external.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @multi_call_with_external() #0 {
@@ -379,14 +377,14 @@ define amdgpu_kernel void @multi_call_with_external() #0 {
; GCN-LABEL: {{^}}usage_external:
; GCN: .set usage_external.num_vgpr, max(32, max_num_vgpr)
; GCN: .set usage_external.num_agpr, max(0, max_num_agpr)
-; GCN: .set usage_external.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external.numbered_sgpr, max(33, max_num_sgpr)
; GCN: .set usage_external.private_seg_size, 0
; GCN: .set usage_external.uses_vcc, 1
; GCN: .set usage_external.uses_flat_scratch, 1
; GCN: .set usage_external.has_dyn_sized_stack, 1
; GCN: .set usage_external.has_recursion, 0
; GCN: .set usage_external.has_indirect_call, 1
-; GCN: NumSgprs: usage_external.num_sgpr+6
+; GCN: TotalNumSgprs: usage_external.numbered_sgpr+6
; GCN: NumVgprs: usage_external.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @usage_external() #0 {
@@ -399,14 +397,14 @@ declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
; GCN: .set usage_external_recurse.num_vgpr, max(32, max_num_vgpr)
; GCN: .set usage_external_recurse.num_agpr, max(0, max_num_agpr)
-; GCN: .set usage_external_recurse.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external_recurse.numbered_sgpr, max(33, max_num_sgpr)
; GCN: .set usage_external_recurse.private_seg_size, 0
; GCN: .set usage_external_recurse.uses_vcc, 1
; GCN: .set usage_external_recurse.uses_flat_scratch, 1
; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1
; GCN: .set usage_external_recurse.has_recursion, 1
; GCN: .set usage_external_recurse.has_indirect_call, 1
-; GCN: NumSgprs: usage_external_recurse.num_sgpr+6
+; GCN: TotalNumSgprs: usage_external_recurse.numbered_sgpr+6
; GCN: NumVgprs: usage_external_recurse.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @usage_external_recurse() #0 {
@@ -417,14 +415,14 @@ define amdgpu_kernel void @usage_external_recurse() #0 {
; GCN-LABEL: {{^}}direct_recursion_use_stack:
; GCN: .set direct_recursion_use_stack.num_vgpr, 41
; GCN: .set direct_recursion_use_stack.num_agpr, 0
-; GCN: .set direct_recursion_use_stack.num_sgpr, 36
+; GCN: .set direct_recursion_use_stack.numbered_sgpr, 36
; GCN: .set direct_recursion_use_stack.private_seg_size, 2064
; GCN: .set direct_recursion_use_stack.uses_vcc, 1
; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0
; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0
; GCN: .set direct_recursion_use_stack.has_recursion, 1
; GCN: .set direct_recursion_use_stack.has_indirect_call, 0
-; GCN: NumSgprs: 40
+; GCN: TotalNumSgprs: 40
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2064
define void @direct_recursion_use_stack(i32 %val) #2 {
@@ -445,14 +443,14 @@ ret:
; GCN-LABEL: {{^}}usage_direct_recursion:
; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
-; GCN: .set usage_direct_recursion.num_sgpr, max(33, direct_recursion_use_stack.num_sgpr)
+; GCN: .set usage_direct_recursion.numbered_sgpr, max(33, direct_recursion_use_stack.numbered_sgpr)
; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion)
; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call)
-; GCN: NumSgprs: 42
+; GCN: TotalNumSgprs: 42
; GCN: NumVgprs: 41
; GCN: ScratchSize: 2064
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
@@ -464,14 +462,14 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, max_num_vgpr)
; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, max_num_sgpr)
; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1
; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_sgpr96_external_call.has_recursion, 0
; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1
-; GCN: NumSgprs: count_use_sgpr96_external_call.num_sgpr+6
+; GCN: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_sgpr96_external_call() {
@@ -485,14 +483,14 @@ entry:
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1
; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_sgpr160_external_call.has_recursion, 0
; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1
-; GCN: NumSgprs: count_use_sgpr160_external_call.num_sgpr+6
+; GCN: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_sgpr160_external_call() {
@@ -506,14 +504,14 @@ entry:
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1
; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1
; GCN: .set count_use_vgpr160_external_call.has_recursion, 0
; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1
-; GCN: NumSgprs: count_use_vgpr160_external_call.num_sgpr+6
+; GCN: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+6
; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
; GCN: ScratchSize: 0
define amdgpu_kernel void @count_use_vgpr160_external_call() {
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 4575df1e0c6b95..957f404c8cdbed 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
; GCN-NOT: writelane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
-; GCN: ; NumSgprs: 37
+; GCN: ; TotalNumSgprs: 37
; GCN: ; NumVgprs: 9
define amdgpu_kernel void @kernel_call() #0 {
%vgpr = load volatile i32, ptr addrspace(1) undef
@@ -48,7 +48,7 @@ define amdgpu_kernel void @kernel_call() #0 {
; GCN-NOT: readlane
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
-; GCN: ; NumSgprs: 34
+; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 10
define void @func_regular_call() #1 {
%vgpr = load volatile i32, ptr addrspace(1) undef
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
; GCN-NEXT: s_addc_u32 s17,
; GCN-NEXT: s_setpc_b64 s[16:17]
-; GCN: ; NumSgprs: 32
+; GCN: ; TotalNumSgprs: 32
; GCN: ; NumVgprs: 8
define void @func_tail_call() #1 {
tail call void @func()
@@ -77,7 +77,7 @@ define void @func_tail_call() #1 {
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
; GCN: s_setpc_b64
-; GCN: ; NumSgprs: 34
+; GCN: ; TotalNumSgprs: 34
; GCN: ; NumVgprs: 10
define void @func_call_tail_call() #1 {
%vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index 6d18f354e65422..a2baa56ea0c989 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
-; CHECK: ; NumSgprs: 4
+; CHECK: ; TotalNumSgprs: 4
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 8d12b3fe626da8..35e11ad6a648ba 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-LABEL: {{^}}foo:
; SI: .section .AMDGPU.csdata
; SI: ; Kernel info:
-; SI: ; NumSgprs: {{[0-9]+}}
+; SI: ; TotalNumSgprs: {{[0-9]+}}
; SI: ; NumVgprs: {{[0-9]+}}
define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %abase, ptr addrspace(1) %bbase) nounwind {
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 4f10e90fd087fa..8bbae59f468f1d 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,7 +2,7 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t
; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
+; STDERR-NEXT: remark: foo.cl:27:0: TotalSGPRs: 28
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
@@ -27,7 +27,7 @@
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' SGPRs: '
+; REMARK-NEXT: - String: ' TotalSGPRs: '
; REMARK-NEXT: - NumSGPR: '28'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
@@ -122,7 +122,7 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
+; STDERR-NEXT: remark: foo.cl:8:0: TotalSGPRs: 4
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
@@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
}
; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: test_indirect_call.num_sgpr+6
+; STDERR-NEXT: remark: foo.cl:64:0: TotalSGPRs: test_indirect_call.numbered_sgpr+6
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.numbered_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
}
; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: test_indirect_w_static_stack.num_sgpr+6
+; STDERR-NEXT: remark: foo.cl:74:0: TotalSGPRs: test_indirect_w_static_stack.numbered_sgpr+6
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.numbered_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
>From 91672949f6ce2e3ae1032eafd9884824e207213d Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 17 Sep 2024 13:41:29 +0100
Subject: [PATCH 09/11] Feedback
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 37 ++++++++++++++++---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 5 ---
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 3 +-
3 files changed, 32 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index ea20d9313a2c7c..cef5200aadb649 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -437,12 +437,32 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
}
}
- auto I = OccupancyValidateMap.find(&F);
- if (I != OccupancyValidateMap.end()) {
+ MCSymbol *NumVgprSymbol =
+ RI.getSymbol(F.getName(), RIK::RIK_NumVGPR, OutContext);
+ MCSymbol *NumAgprSymbol =
+ RI.getSymbol(F.getName(), RIK::RIK_NumAGPR, OutContext);
+ uint64_t NumVgpr, NumAgpr;
+
+ if (NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
+ TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
+ TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
+ SIMachineFunctionInfo MFI(F, &STM);
+ unsigned MaxWaves = MFI.getMaxWavesPerEU();
+ uint64_t TotalNumVgpr =
+ getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
+ uint64_t NumVGPRsForWavesPerEU = std::max(
+ {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
+ uint64_t NumSGPRsForWavesPerEU = std::max(
+ {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
+ const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
+ STM.computeOccupancy(F, MFI.getLDSSize()),
+ MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
+ MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
+ OutContext);
+ uint64_t Occupancy;
+
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-waves-per-eu", {0, 0}, true);
- uint64_t Occupancy;
- const MCExpr *OccupancyExpr = I->getSecond();
if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
DiagnosticInfoOptimizationFailure Diag(
@@ -473,9 +493,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Assign expressions which can only be resolved when all other functions are
// known.
RI.finalize(OutContext);
+
+ // Switch section and emit all GPR maximums within the processed module.
+ OutStreamer->pushSection();
+ MCSectionELF *MaxGPRSection =
+ OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
+ OutStreamer->switchSection(MaxGPRSection);
getTargetStreamer()->EmitMCResourceMaximums(RI.getMaxVGPRSymbol(OutContext),
RI.getMaxAGPRSymbol(OutContext),
RI.getMaxSGPRSymbol(OutContext));
+ OutStreamer->popSection();
for (Function &F : M.functions())
validateMCResourceInfo(F);
@@ -1217,8 +1244,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
- OccupancyValidateMap.insert({&MF.getFunction(), ProgInfo.Occupancy});
-
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
uint64_t Occupancy;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 3532cc8dea0238..cc8c4411805e23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -50,11 +50,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- // validateMCResourceInfo cannot recompute parts of the occupancy as it does
- // for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
- // really want to track and validate the occupancy.
- DenseMap<const Function *, const MCExpr *> OccupancyValidateMap;
-
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 785401739de21b..b46b2d582976e9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -105,9 +105,8 @@ void MCResourceInfo::assignResourceInfoExpr(
Seen.insert(&F);
for (const Function *Callee : Callees) {
- if (Seen.contains(Callee))
+ if (!Seen.insert(Callee).second)
continue;
- Seen.insert(Callee);
MCSymbol *CalleeValSym = getSymbol(Callee->getName(), RIK, OutContext);
ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
}
>From 79a00a4bf410612af137c5bd29a66fbf767f38e8 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 23 Sep 2024 17:15:16 +0100
Subject: [PATCH 10/11] Feedback: Documentation and rematerialization of MFI
through MachineFunction
---
llvm/docs/AMDGPUUsage.rst | 50 +++++++++++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 10 ++++-
2 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 117fc2cf6bbbc8..2649b3a7d360b2 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1755,6 +1755,56 @@ As part of the AMDGPU MC layer, AMDGPU provides the following target specific
=================== ================= ========================================================
+Function Resource Usage
+-----------------------
+
+The function resource information (e.g., number of VGPRs) required depends on
+all of its callees' function resources. The expression to denote these
+resources should, therefore, be described as the propagative of its callees'
+equivalent expressions. Said expressions are generated and emitted (as symbols)
+by the compiler when compiling to either assembly or object format and should
+not be overwritten or redefined.
+
+The following describes all emitted function resource usage information:
+
+ .. table:: Function Resource Usage:
+ :name: function-usage-table
+
+ ===================================== ========= ========================================= ===============================================================================
+ Symbol Type Description Example
+ ===================================== ========= ========================================= ===============================================================================
+ <function_name>.num_vgpr Integer Number of VGPRs used by <function_name>, .set foo.num_vgpr, max(32, bar.num_vgpr, baz.num_vgpr)
+ worst case of itself and its callees'
+ VGPR use
+ <function_name>.num_agpr Integer Number of AGPRs used by <function_name>, .set foo.num_agpr, max(35, bar.num_agpr)
+ worst case of itself and its callees'
+ AGPR use
+ <function_name>.numbered_sgpr Integer Number of SGPRs used by <function_name>, .set foo.numbered_sgpr, 21
+ worst case of itself and its callees'
+ SGPR use (without any of the implicitly
+ used SGPRs)
+ <function_name>.private_seg_size Integer Total stack size required for .set foo.private_seg_size, 16+max(bar.private_seg_size, baz.private_seg_size)
+ <function_name>, expression is the
+ locally used stack size + the worst case
+ callee
+ <function_name>.uses_vcc Bool Whether <function_name>, or any of its .set foo.uses_vcc, or(0, bar.uses_vcc)
+ callees, uses vcc or not
+ <function_name>.uses_flat_scratch Bool Whether <function_name>, or any of its .set foo.uses_flat_scratch, 1
+ callees, uses flat scratch or not
+ <function_name>.has_dyn_sized_stack Bool Whether <function_name> stack is .set foo.has_dyn_sized_stack, 1
+ dynamically sized
+ <function_name>.has_recursion Bool Whether <function_name>, or any of its .set foo.has_recursion, 0
+ callees, contains recursion
+ <function_name>.has_indirect_call Bool Whether <function_name>, or any of its .set foo.has_indirect_call, max(0, bar.has_indirect_call)
+ callees, contains an indirect call
+ ===================================== ========= ========================================= ===============================================================================
+
+Furthermore, three symbols are additionally emitted describing the compilation
+unit's worst case (i.e., maxima) ``num_vgpr``, ``num_agpr``, and
+``numbered_sgpr`` which may be referenced and used by the aforementioned
+symbolic expressions. These three symbols are ``max_num_vgpr``,
+``max_num_agpr``, and ``max_num_sgpr``.
+
.. _amdgpu-elf-code-object:
ELF Code Object
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index cef5200aadb649..1e8f19cc4b101e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -34,6 +34,7 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
@@ -443,10 +444,13 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
RI.getSymbol(F.getName(), RIK::RIK_NumAGPR, OutContext);
uint64_t NumVgpr, NumAgpr;
- if (NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ MachineFunction *MF = MMI.getMachineFunction(F);
+ if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
- SIMachineFunctionInfo MFI(F, &STM);
+ const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
@@ -1630,6 +1634,8 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AMDGPUResourceUsageAnalysis>();
AU.addPreserved<AMDGPUResourceUsageAnalysis>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
AsmPrinter::getAnalysisUsage(AU);
}
>From a5b96683198b4e7857f1b384d5334628d8dafc22 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 24 Sep 2024 21:41:07 +0100
Subject: [PATCH 11/11] Feedback, rename max_num_Xgpr symbol, rephrasing doc
---
llvm/docs/AMDGPUUsage.rst | 23 +++++-----
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 6 +--
.../CodeGen/AMDGPU/agpr-register-count.ll | 10 ++---
.../amdpal-metadata-agpr-register-count.ll | 4 +-
...-amdgpu-flat-work-group-size-vgpr-limit.ll | 44 +++++++++----------
.../AMDGPU/call-graph-register-usage.ll | 18 ++++----
.../CodeGen/AMDGPU/function-resource-usage.ll | 42 +++++++++---------
7 files changed, 73 insertions(+), 74 deletions(-)
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 2649b3a7d360b2..4bfa4ab71293c2 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1758,14 +1758,13 @@ As part of the AMDGPU MC layer, AMDGPU provides the following target specific
Function Resource Usage
-----------------------
-The function resource information (e.g., number of VGPRs) required depends on
-all of its callees' function resources. The expression to denote these
-resources should, therefore, be described as the propagative of its callees'
-equivalent expressions. Said expressions are generated and emitted (as symbols)
-by the compiler when compiling to either assembly or object format and should
-not be overwritten or redefined.
+A function's resource usage depends on each of its callees' resource usage. The
+expressions used to denote resource usage reflect this by propagating each
+callees' equivalent expressions. Said expressions are emitted as symbols by the
+compiler when compiling to either assembly or object format and should not be
+overwritten or redefined.
-The following describes all emitted function resource usage information:
+The following describes all emitted function resource usage symbols:
.. table:: Function Resource Usage:
:name: function-usage-table
@@ -1788,11 +1787,11 @@ The following describes all emitted function resource usage information:
locally used stack size + the worst case
callee
<function_name>.uses_vcc Bool Whether <function_name>, or any of its .set foo.uses_vcc, or(0, bar.uses_vcc)
- callees, uses vcc or not
+ callees, uses vcc
<function_name>.uses_flat_scratch Bool Whether <function_name>, or any of its .set foo.uses_flat_scratch, 1
callees, uses flat scratch or not
- <function_name>.has_dyn_sized_stack Bool Whether <function_name> stack is .set foo.has_dyn_sized_stack, 1
- dynamically sized
+ <function_name>.has_dyn_sized_stack Bool Whether <function_name>, or any of its .set foo.has_dyn_sized_stack, 1
+ callees, is dynamically sized
<function_name>.has_recursion Bool Whether <function_name>, or any of its .set foo.has_recursion, 0
callees, contains recursion
<function_name>.has_indirect_call Bool Whether <function_name>, or any of its .set foo.has_indirect_call, max(0, bar.has_indirect_call)
@@ -1802,8 +1801,8 @@ The following describes all emitted function resource usage information:
Furthermore, three symbols are additionally emitted describing the compilation
unit's worst case (i.e., maxima) ``num_vgpr``, ``num_agpr``, and
``numbered_sgpr`` which may be referenced and used by the aforementioned
-symbolic expressions. These three symbols are ``max_num_vgpr``,
-``max_num_agpr``, and ``max_num_sgpr``.
+symbolic expressions. These three symbols are ``amdgcn.max_num_vgpr``,
+``amdgcn.max_num_agpr``, and ``amdgcn.max_num_sgpr``.
.. _amdgpu-elf-code-object:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index b46b2d582976e9..25278327865958 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -78,15 +78,15 @@ void MCResourceInfo::finalize(MCContext &OutContext) {
}
MCSymbol *MCResourceInfo::getMaxVGPRSymbol(MCContext &OutContext) {
- return OutContext.getOrCreateSymbol("max_num_vgpr");
+ return OutContext.getOrCreateSymbol("amdgcn.max_num_vgpr");
}
MCSymbol *MCResourceInfo::getMaxAGPRSymbol(MCContext &OutContext) {
- return OutContext.getOrCreateSymbol("max_num_agpr");
+ return OutContext.getOrCreateSymbol("amdgcn.max_num_agpr");
}
MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
- return OutContext.getOrCreateSymbol("max_num_sgpr");
+ return OutContext.getOrCreateSymbol("amdgcn.max_num_sgpr");
}
void MCResourceInfo::assignResourceInfoExpr(
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 2615ca33554ac6..6c55a1a31dc983 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -156,8 +156,8 @@ declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GCN: .set kernel_call_undef_func.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; GCN: .set kernel_call_undef_func.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set kernel_call_undef_func.num_agpr, max(0, amdgcn.max_num_agpr)
; GCN: NumVgprs: kernel_call_undef_func.num_vgpr
; GCN: NumAgprs: kernel_call_undef_func.num_agpr
; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
@@ -174,8 +174,8 @@ bb:
ret void
}
-; GCN: .set max_num_vgpr, 32
-; GCN-NEXT: .set max_num_agpr, 32
-; GCN-NEXT: .set max_num_sgpr, 34
+; GCN: .set amdgcn.max_num_vgpr, 32
+; GCN-NEXT: .set amdgcn.max_num_agpr, 32
+; GCN-NEXT: .set amdgcn.max_num_sgpr, 34
attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
index f64a5e01cd2560..38c9a1c4e2742c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -60,9 +60,9 @@ bb:
declare void @undef_func()
; CHECK: .type kernel_call_undef_func
-; CHECK: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; CHECK: .set kernel_call_undef_func.num_agpr, max(0, amdgcn.max_num_agpr)
; CHECK: NumAgprs: kernel_call_undef_func.num_agpr
-; CHECK: .set max_num_agpr, 32
+; CHECK: .set amdgcn.max_num_agpr, 32
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index c893f6b04b7b66..ea47da86a7fc56 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -547,17 +547,17 @@ define amdgpu_kernel void @f256() #256 {
attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GCN-LABEL: {{^}}f512:
-; GFX9: .set f512.num_vgpr, max(128, max_num_vgpr)
-; GFX90A: .set f512.num_vgpr, max(128, max_num_vgpr)
-; GFX90A: .set f512.num_agpr, max(128, max_num_agpr)
-; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
-; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
-; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, max_num_vgpr)
-; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, max_num_vgpr)
-; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
-; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
-; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, max_num_vgpr)
-; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, max_num_vgpr)
+; GFX9: .set f512.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX90A: .set f512.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX90A: .set f512.num_agpr, max(128, amdgcn.max_num_agpr)
+; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, amdgcn.max_num_vgpr)
+; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, amdgcn.max_num_vgpr)
+; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, amdgcn.max_num_vgpr)
+; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, amdgcn.max_num_vgpr)
+; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, amdgcn.max_num_vgpr)
+; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, amdgcn.max_num_vgpr)
; GCN: NumVgprs: f512.num_vgpr
; GFX90A: NumAgprs: f512.num_agpr
; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr)
@@ -569,17 +569,17 @@ define amdgpu_kernel void @f512() #512 {
attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GCN-LABEL: {{^}}f1024:
-; GFX9: .set f1024.num_vgpr, max(64, max_num_vgpr)
-; GFX90A: .set f1024.num_vgpr, max(64, max_num_vgpr)
-; GFX90A: .set f1024.num_agpr, max(64, max_num_agpr)
-; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, max_num_vgpr)
-; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, max_num_vgpr)
-; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, max_num_vgpr)
-; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, max_num_vgpr)
-; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, max_num_vgpr)
-; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, max_num_vgpr)
-; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, max_num_vgpr)
-; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GFX9: .set f1024.num_vgpr, max(64, amdgcn.max_num_vgpr)
+; GFX90A: .set f1024.num_vgpr, max(64, amdgcn.max_num_vgpr)
+; GFX90A: .set f1024.num_agpr, max(64, amdgcn.max_num_agpr)
+; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, amdgcn.max_num_vgpr)
+; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, amdgcn.max_num_vgpr)
+; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, amdgcn.max_num_vgpr)
+; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, amdgcn.max_num_vgpr)
+; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, amdgcn.max_num_vgpr)
+; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, amdgcn.max_num_vgpr)
+; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, amdgcn.max_num_vgpr)
; GCN: NumVgprs: f1024.num_vgpr
; GFX90A: NumAgprs: f1024.num_agpr
; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr)
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 0328066bd33d8f..2a961b880b7253 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -234,8 +234,8 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, max_num_vgpr)
-; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgcn.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
@@ -249,8 +249,8 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, max_num_vgpr)
-; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgcn.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
@@ -264,8 +264,8 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, max_num_vgpr)
-; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, amdgcn.max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; CI: TotalNumSgprs: count_use_vgpr160_external_call.numbered_sgpr+4
; VI-BUG: TotalNumSgprs: 96
; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
@@ -276,9 +276,9 @@ entry:
ret void
}
-; GCN: .set max_num_vgpr, 50
-; GCN: .set max_num_agpr, 0
-; GCN: .set max_num_sgpr, 80
+; GCN: .set amdgcn.max_num_vgpr, 50
+; GCN: .set amdgcn.max_num_agpr, 0
+; GCN: .set amdgcn.max_num_sgpr, 80
; GCN-LABEL: amdhsa.kernels:
; GCN: .name: count_use_sgpr96_external_call
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index ae6a6575d62ec8..e1165a4bdfc0c2 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -355,9 +355,9 @@ define amdgpu_kernel void @multi_call_use_use_stack() #0 {
declare void @external() #0
; GCN-LABEL: {{^}}multi_call_with_external:
-; GCN: .set multi_call_with_external.num_vgpr, max(41, max_num_vgpr)
-; GCN: .set multi_call_with_external.num_agpr, max(0, max_num_agpr)
-; GCN: .set multi_call_with_external.numbered_sgpr, max(42, max_num_sgpr)
+; GCN: .set multi_call_with_external.num_vgpr, max(41, amdgcn.max_num_vgpr)
+; GCN: .set multi_call_with_external.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set multi_call_with_external.numbered_sgpr, max(42, amdgcn.max_num_sgpr)
; GCN: .set multi_call_with_external.private_seg_size, 0
; GCN: .set multi_call_with_external.uses_vcc, 1
; GCN: .set multi_call_with_external.uses_flat_scratch, 1
@@ -375,9 +375,9 @@ define amdgpu_kernel void @multi_call_with_external() #0 {
}
; GCN-LABEL: {{^}}usage_external:
-; GCN: .set usage_external.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set usage_external.num_agpr, max(0, max_num_agpr)
-; GCN: .set usage_external.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set usage_external.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set usage_external.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; GCN: .set usage_external.private_seg_size, 0
; GCN: .set usage_external.uses_vcc, 1
; GCN: .set usage_external.uses_flat_scratch, 1
@@ -395,9 +395,9 @@ define amdgpu_kernel void @usage_external() #0 {
declare void @external_recurse() #2
; GCN-LABEL: {{^}}usage_external_recurse:
-; GCN: .set usage_external_recurse.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set usage_external_recurse.num_agpr, max(0, max_num_agpr)
-; GCN: .set usage_external_recurse.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external_recurse.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set usage_external_recurse.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set usage_external_recurse.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; GCN: .set usage_external_recurse.private_seg_size, 0
; GCN: .set usage_external_recurse.uses_vcc, 1
; GCN: .set usage_external_recurse.uses_flat_scratch, 1
@@ -460,9 +460,9 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1
@@ -481,9 +481,9 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1
@@ -502,9 +502,9 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
-; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
-; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, max_num_agpr)
-; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, amdgcn.max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, amdgcn.max_num_agpr)
+; GCN: .set count_use_vgpr160_external_call.numbered_sgpr, max(33, amdgcn.max_num_sgpr)
; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0
; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1
; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1
@@ -522,9 +522,9 @@ entry:
}
; Added at the end of the .s are the module level maximums
-; GCN: .set max_num_vgpr, 50
-; GCN: .set max_num_agpr, 0
-; GCN: .set max_num_sgpr, 80
+; GCN: .set amdgcn.max_num_vgpr, 50
+; GCN: .set amdgcn.max_num_agpr, 0
+; GCN: .set amdgcn.max_num_sgpr, 80
attributes #0 = { nounwind noinline norecurse }
attributes #1 = { nounwind noinline norecurse }
More information about the cfe-commits
mailing list