[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)
Janek van Oirschot via cfe-commits
cfe-commits@lists.llvm.org
Fri Aug 16 06:24:09 PDT 2024
https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/102913
From 136ad3a97df7b02d89d845920c53ef80da1e7c31 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot@amd.com>
Date: Mon, 12 Aug 2024 14:58:41 +0100
Subject: [PATCH 1/3] Convert AMDGPUResourceUsageAnalysis pass from Module to
MachineFunction pass and move metadata propagation logic to MC layer
---
.../amdgcn-machine-analysis-remarks.cl | 10 +-
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 208 +++++--
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 19 +-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 220 ++++++++
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 94 +++
.../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 147 +----
.../AMDGPU/AMDGPUResourceUsageAnalysis.h | 40 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
.../MCTargetDesc/AMDGPUTargetStreamer.cpp | 41 ++
.../MCTargetDesc/AMDGPUTargetStreamer.h | 23 +
.../Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 10 +-
.../AMDGPU/GlobalISel/extractelement.ll | 168 +++---
.../AMDGPU/GlobalISel/flat-scratch-init.ll | 16 +-
.../GlobalISel/llvm.amdgcn.workitem.id.ll | 6 +-
.../AMDGPU/GlobalISel/non-entry-alloca.ll | 8 +-
.../CodeGen/AMDGPU/agpr-register-count.ll | 65 ++-
.../CodeGen/AMDGPU/amdgpu.private-memory.ll | 3 +-
.../amdpal-metadata-agpr-register-count.ll | 4 +-
...-amdgpu-flat-work-group-size-vgpr-limit.ll | 51 +-
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 30 +-
.../AMDGPU/call-alias-register-usage-agpr.ll | 16 +-
.../AMDGPU/call-alias-register-usage0.ll | 8 +-
.../AMDGPU/call-alias-register-usage1.ll | 11 +-
.../AMDGPU/call-alias-register-usage2.ll | 11 +-
.../AMDGPU/call-alias-register-usage3.ll | 11 +-
.../AMDGPU/call-graph-register-usage.ll | 50 +-
.../callee-special-input-sgprs-fixed-abi.ll | 4 +-
.../test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll | 1 +
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 34 +-
.../AMDGPU/codegen-internal-only-func.ll | 6 +-
.../AMDGPU/control-flow-fastregalloc.ll | 6 +-
llvm/test/CodeGen/AMDGPU/elf.ll | 7 +-
.../enable-scratch-only-dynamic-stack.ll | 14 +-
.../CodeGen/AMDGPU/function-resource-usage.ll | 533 ++++++++++++++++++
.../AMDGPU/gfx11-user-sgpr-init16-bug.ll | 16 +-
.../AMDGPU/inline-asm-reserved-regs.ll | 2 +-
.../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 24 +-
llvm/test/CodeGen/AMDGPU/ipra.ll | 14 +-
llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 4 +-
.../CodeGen/AMDGPU/kernel_code_t_recurse.ll | 7 +-
.../CodeGen/AMDGPU/large-alloca-compute.ll | 31 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 45 +-
.../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll | 6 +-
.../AMDGPU/lower-module-lds-offsets.ll | 26 +-
llvm/test/CodeGen/AMDGPU/mesa3d.ll | 13 +-
.../AMDGPU/module-lds-false-sharing.ll | 99 ++--
llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 20 +-
llvm/test/CodeGen/AMDGPU/recursion.ll | 28 +-
.../AMDGPU/resource-optimization-remarks.ll | 64 +--
.../AMDGPU/resource-usage-dead-function.ll | 13 +-
.../CodeGen/AMDGPU/stack-realign-kernel.ll | 126 +++--
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 3 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 3 +-
llvm/test/CodeGen/AMDGPU/trap.ll | 33 +-
55 files changed, 1801 insertions(+), 655 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
create mode 100644 llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b9127..a2dd59a871904c 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,12 +2,12 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
// expected-remark@+10 {{Function Name: foo}}
-// expected-remark@+9 {{ SGPRs: 13}}
-// expected-remark@+8 {{ VGPRs: 10}}
-// expected-remark@+7 {{ AGPRs: 12}}
-// expected-remark@+6 {{ ScratchSize [bytes/lane]: 0}}
+// expected-remark@+9 {{ SGPRs: foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1))}}
+// expected-remark@+8 {{ VGPRs: foo.num_vgpr}}
+// expected-remark@+7 {{ AGPRs: foo.num_agpr}}
+// expected-remark@+6 {{ ScratchSize [bytes/lane]: foo.private_seg_size}}
// expected-remark@+5 {{ Dynamic Stack: False}}
-// expected-remark@+4 {{ Occupancy [waves/SIMD]: 10}}
+// expected-remark@+4 {{ Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(foo.num_agpr, foo.num_vgpr), 1, 0))}}
// expected-remark@+3 {{ SGPRs Spill: 0}}
// expected-remark@+2 {{ VGPRs Spill: 0}}
// expected-remark@+1 {{ LDS Size [bytes/block]: 0}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b90d245b7bd394..23bc804515e690 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPU.h"
#include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUMCResourceInfo.h"
#include "AMDGPUResourceUsageAnalysis.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -92,6 +93,9 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
+ RI = std::make_unique<MCResourceInfo>(OutContext);
+ OccupancyValidateMap =
+ std::make_unique<DenseMap<const Function *, const MCExpr *>>();
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -359,6 +363,102 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
+void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
+ if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+ return;
+
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+
+ auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
+ int64_t Val;
+ if (Value->evaluateAsAbsolute(Val)) {
+ Res = Val;
+ return true;
+ }
+ return false;
+ };
+
+ const uint64_t MaxScratchPerWorkitem =
+ STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
+ MCSymbol *ScratchSizeSymbol =
+ RI->getSymbol(F.getName(), RIK::RIK_PrivateSegSize);
+ uint64_t ScratchSize;
+ if (ScratchSizeSymbol->isVariable() &&
+ TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
+ ScratchSize > MaxScratchPerWorkitem) {
+ DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
+ DS_Error);
+ F.getContext().diagnose(DiagStackSize);
+ }
+
+ // Validate addressable scalar registers (i.e., prior to added implicit
+ // SGPRs).
+ MCSymbol *NumSGPRSymbol = RI->getSymbol(F.getName(), RIK::RIK_NumSGPR);
+ if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ !STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ uint64_t NumSgpr;
+ if (NumSGPRSymbol->isVariable() &&
+ TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+ NumSgpr > MaxAddressableNumSGPRs) {
+ DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
+ NumSgpr, MaxAddressableNumSGPRs,
+ DS_Error, DK_ResourceLimit);
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+
+ MCSymbol *VCCUsedSymbol = RI->getSymbol(F.getName(), RIK::RIK_UsesVCC);
+ MCSymbol *FlatUsedSymbol =
+ RI->getSymbol(F.getName(), RIK::RIK_UsesFlatScratch);
+ uint64_t VCCUsed, FlatUsed, NumSgpr;
+
+ if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
+ FlatUsedSymbol->isVariable() &&
+ TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+ TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
+ TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
+
+ // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
+ // resolvable.
+ NumSgpr += IsaInfo::getNumExtraSGPRs(
+ &STM, VCCUsed, FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+ if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+ STM.hasSGPRInitBug()) {
+ unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+ if (NumSgpr > MaxAddressableNumSGPRs) {
+ DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
+ MaxAddressableNumSGPRs, DS_Error,
+ DK_ResourceLimit);
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+
+ auto I = OccupancyValidateMap->find(&F);
+ if (I != OccupancyValidateMap->end()) {
+ const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
+ F, "amdgpu-waves-per-eu", {0, 0}, true);
+ uint64_t Occupancy;
+ const MCExpr *OccupancyExpr = I->getSecond();
+
+ if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
+ DiagnosticInfoOptimizationFailure Diag(
+ F, F.getSubprogram(),
+ "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
+ "'" +
+ F.getName() + "': desired occupancy was " + Twine(MinWEU) +
+ ", final occupancy is " + Twine(Occupancy));
+ F.getContext().diagnose(Diag);
+ return;
+ }
+ }
+ }
+}
+
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
@@ -371,25 +471,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
getTargetStreamer()->EmitCodeEnd(STI);
}
- return AsmPrinter::doFinalization(M);
-}
+ // Assign expressions which can only be resolved when all other functions are
+ // known.
+ RI->Finalize();
+ getTargetStreamer()->EmitMCResourceMaximums(
+ RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
-// Print comments that apply to both callable functions and entry points.
-void AMDGPUAsmPrinter::emitCommonFunctionComments(
- uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
- uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
- const AMDGPUMachineFunction *MFI) {
- OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
- OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
- OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
- if (NumAGPR) {
- OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
- OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
- false);
+ for (Function &F : M.functions()) {
+ ValidateMCResourceInfo(F);
}
- OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
- OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
- false);
+ return AsmPrinter::doFinalization(M);
}
SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
@@ -402,6 +493,7 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
return Str;
}
+// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
@@ -571,21 +663,45 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
STM.hasMAIInsts());
+ {
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+ ResourceUsage->getResourceInfo();
+ RI->gatherResourceInfo(MF, Info);
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ getTargetStreamer()->EmitMCResourceInfo(
+ RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_NumSGPR),
+ RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize),
+ RI->getSymbol(MF.getName(), RIK::RIK_UsesVCC),
+ RI->getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasRecursion),
+ RI->getSymbol(MF.getName(), RIK::RIK_HasIndirectCall));
+ }
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
OutStreamer->switchSection(CommentSection);
if (!MFI->isEntryFunction()) {
+ using RIK = MCResourceInfo::ResourceInfoKind;
OutStreamer->emitRawComment(" Function info:", false);
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
- ResourceUsage->getResourceInfo(&MF.getFunction());
+
emitCommonFunctionComments(
- Info.NumVGPR,
- STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
- Info.getTotalNumVGPRs(STM),
- Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
- Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
+ RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR)->getVariableValue(),
+ STM.hasMAIInsts() ? RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR)
+ ->getVariableValue()
+ : nullptr,
+ RI->createTotalNumVGPRs(MF, Ctx),
+ RI->createTotalNumSGPRs(
+ MF,
+ MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
+ Ctx),
+ RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize)
+ ->getVariableValue(),
+ getFunctionCodeSize(MF), MFI);
return false;
}
@@ -753,8 +869,6 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
- const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
- ResourceUsage->getResourceInfo(&MF.getFunction());
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
MCContext &Ctx = MF.getContext();
@@ -771,18 +885,38 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
return false;
};
- ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
- ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
- ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
- ProgInfo.AccumOffset =
- CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
+ auto GetSymRefExpr =
+ [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
+ MCSymbol *Sym = RI->getSymbol(MF.getName(), RIK);
+ return MCSymbolRefExpr::create(Sym, Ctx);
+ };
+
+ const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+ const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+ using RIK = MCResourceInfo::ResourceInfoKind;
+ ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
+ ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
+ ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
+ ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+
+ // AccumOffset computed for the MCExpr equivalent of:
+ // alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+ ProgInfo.AccumOffset = MCBinaryExpr::createSub(
+ MCBinaryExpr::createDiv(
+ AMDGPUMCExpr::createAlignTo(
+ AMDGPUMCExpr::createMax({ConstOne, ProgInfo.NumArchVGPR}, Ctx),
+ ConstFour, Ctx),
+ ConstFour, Ctx),
+ ConstOne, Ctx);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
- ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
- ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
- ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
- ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+ ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
+ ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
+ ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
+ ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
ProgInfo.DynamicCallStack =
- CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
+ MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
+ GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
const uint64_t MaxScratchPerWorkitem =
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
@@ -1082,6 +1216,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+ OccupancyValidateMap->insert({&MF.getFunction(), ProgInfo.Occupancy});
+
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
uint64_t Occupancy;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f66bbde42ce278..676a4687ee2af7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -24,6 +24,7 @@ struct AMDGPUResourceUsageAnalysis;
class AMDGPUTargetStreamer;
class MCCodeEmitter;
class MCOperand;
+class MCResourceInfo;
namespace AMDGPU {
struct MCKernelDescriptor;
@@ -40,12 +41,20 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
AMDGPUResourceUsageAnalysis *ResourceUsage;
+ std::unique_ptr<MCResourceInfo> RI;
+
SIProgramInfo CurrentProgramInfo;
std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
+ // ValidateMCResourceInfo cannot recompute parts of the occupancy as it does
+ // for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
+ // really want to track and validate the occupancy.
+ std::unique_ptr<DenseMap<const Function *, const MCExpr *>>
+ OccupancyValidateMap;
+
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
@@ -60,11 +69,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void emitPALFunctionMetadata(const MachineFunction &MF);
- void emitCommonFunctionComments(uint32_t NumVGPR,
- std::optional<uint32_t> NumAGPR,
- uint32_t TotalNumVGPR, uint32_t NumSGPR,
- uint64_t ScratchSize, uint64_t CodeSize,
- const AMDGPUMachineFunction *MFI);
void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
const MCExpr *TotalNumVGPR,
const MCExpr *NumSGPR,
@@ -84,6 +88,11 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
SmallString<128> getMCExprStr(const MCExpr *Value);
+ /// Attempts to replace the validation that is missed in getSIProgramInfo due
+ /// to MCExpr being unknown. Invoked during doFinalization such that the
+ /// MCResourceInfo symbols are known.
+ void ValidateMCResourceInfo(Function &F);
+
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
new file mode 100644
index 00000000000000..58383475b312c9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -0,0 +1,220 @@
+//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCResourceInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
+ switch (RIK) {
+ case RIK_NumVGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_vgpr"));
+ case RIK_NumAGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_agpr"));
+ case RIK_NumSGPR:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".num_sgpr"));
+ case RIK_PrivateSegSize:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".private_seg_size"));
+ case RIK_UsesVCC:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_vcc"));
+ case RIK_UsesFlatScratch:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_flat_scratch"));
+ case RIK_HasDynSizedStack:
+ return OutContext.getOrCreateSymbol(FuncName +
+ Twine(".has_dyn_sized_stack"));
+ case RIK_HasRecursion:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".has_recursion"));
+ case RIK_HasIndirectCall:
+ return OutContext.getOrCreateSymbol(FuncName + Twine(".has_indirect_call"));
+ }
+ llvm_unreachable("Unexpected ResourceInfoKind.");
+}
+
+const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
+ ResourceInfoKind RIK,
+ MCContext &Ctx) {
+ return MCSymbolRefExpr::create(getSymbol(FuncName, RIK), Ctx);
+}
+
+void MCResourceInfo::assignMaxRegs() {
+ // Assign expression to get the max register use to the max_num_Xgpr symbol.
+ MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
+ MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
+ MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
+
+ auto assignMaxRegSym = [this](MCSymbol *Sym, int32_t RegCount) {
+ const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
+ Sym->setVariableValue(MaxExpr);
+ };
+
+ assignMaxRegSym(MaxVGPRSym, MaxVGPR);
+ assignMaxRegSym(MaxAGPRSym, MaxAGPR);
+ assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+}
+
+void MCResourceInfo::Finalize() {
+ assert(!finalized && "Cannot finalize ResourceInfo again.");
+ finalized = true;
+ assignMaxRegs();
+}
+
+MCSymbol *MCResourceInfo::getMaxVGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_vgpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxAGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_agpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxSGPRSymbol() {
+ return OutContext.getOrCreateSymbol("max_num_sgpr");
+}
+
+void MCResourceInfo::assignResourceInfoExpr(
+ int64_t localValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
+ const MachineFunction &MF,
+ const SmallVectorImpl<const Function *> &Callees) {
+ const MCConstantExpr *localConstExpr =
+ MCConstantExpr::create(localValue, OutContext);
+ const MCExpr *SymVal = localConstExpr;
+ if (Callees.size() > 0) {
+ std::vector<const MCExpr *> ArgExprs;
+ // Avoid recursive symbol assignment.
+ SmallSet<StringRef, 8> Seen;
+ ArgExprs.push_back(localConstExpr);
+ Seen.insert(MF.getName());
+
+ for (const Function *Callee : Callees) {
+ if (Seen.contains(Callee->getName()))
+ continue;
+ Seen.insert(Callee->getName());
+ MCSymbol *calleeValSym = getSymbol(Callee->getName(), RIK);
+ ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+ }
+ SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
+ }
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ Sym->setVariableValue(SymVal);
+}
+
+void MCResourceInfo::gatherResourceInfo(
+ const MachineFunction &MF,
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI) {
+ // Worst case VGPR use for non-hardware-entrypoints.
+ MCSymbol *maxVGPRSym = getMaxVGPRSymbol();
+ MCSymbol *maxAGPRSym = getMaxAGPRSymbol();
+ MCSymbol *maxSGPRSym = getMaxSGPRSymbol();
+
+ if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
+ addMaxVGPRCandidate(FRI.NumVGPR);
+ addMaxAGPRCandidate(FRI.NumAGPR);
+ addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+ }
+
+ auto setMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
+ ResourceInfoKind RIK) {
+ if (!FRI.HasIndirectCall) {
+ assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
+ FRI.Callees);
+ } else {
+ const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
+ MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK);
+ const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
+ {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
+ LocalNumSym->setVariableValue(MaxWithLocal);
+ }
+ };
+
+ setMaxReg(maxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
+ setMaxReg(maxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
+ setMaxReg(maxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+
+ {
+ // The expression for private segment size should be: FRI.PrivateSegmentSize
+ // + max(FRI.Callees, FRI.CalleeSegmentSize)
+ std::vector<const MCExpr *> ArgExprs;
+ if (FRI.CalleeSegmentSize)
+ ArgExprs.push_back(
+ MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
+
+ if (!FRI.HasIndirectCall) {
+ for (const Function *Callee : FRI.Callees) {
+ MCSymbol *calleeValSym =
+ getSymbol(Callee->getName(), RIK_PrivateSegSize);
+ ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+ }
+ }
+ const MCExpr *localConstExpr =
+ MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
+ if (ArgExprs.size() > 0) {
+ const AMDGPUMCExpr *transitiveExpr =
+ AMDGPUMCExpr::createMax(ArgExprs, OutContext);
+ localConstExpr =
+ MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
+ }
+ getSymbol(MF.getName(), RIK_PrivateSegSize)
+ ->setVariableValue(localConstExpr);
+ }
+
+ auto setToLocal = [&](int64_t localValue, ResourceInfoKind RIK) {
+ MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+ Sym->setVariableValue(MCConstantExpr::create(localValue, OutContext));
+ };
+
+ if (!FRI.HasIndirectCall) {
+ assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.UsesFlatScratch,
+ ResourceInfoKind::RIK_UsesFlatScratch,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ assignResourceInfoExpr(FRI.HasIndirectCall,
+ ResourceInfoKind::RIK_HasIndirectCall,
+ AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+ } else {
+ setToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+ setToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+ setToLocal(FRI.HasDynamicallySizedStack,
+ ResourceInfoKind::RIK_HasDynSizedStack);
+ setToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+ setToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+ }
+}
+
+const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF,
+ MCContext &Ctx) {
+ return AMDGPUMCExpr::createTotalNumVGPR(
+ getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx),
+ getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx);
+}
+
+const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF,
+ bool hasXnack,
+ MCContext &Ctx) {
+ return MCBinaryExpr::createAdd(
+ getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx),
+ AMDGPUMCExpr::createExtraSGPRs(
+ getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx),
+ getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx),
+ Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
new file mode 100644
index 00000000000000..6646003693a67f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -0,0 +1,94 @@
+//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
+
+namespace llvm {
+
+class MCContext;
+class MCSymbol;
+class StringRef;
+class MachineFunction;
+
+class MCResourceInfo {
+public:
+ enum ResourceInfoKind {
+ RIK_NumVGPR,
+ RIK_NumAGPR,
+ RIK_NumSGPR,
+ RIK_PrivateSegSize,
+ RIK_UsesVCC,
+ RIK_UsesFlatScratch,
+ RIK_HasDynSizedStack,
+ RIK_HasRecursion,
+ RIK_HasIndirectCall
+ };
+
+private:
+ int32_t MaxVGPR;
+ int32_t MaxAGPR;
+ int32_t MaxSGPR;
+
+ MCContext &OutContext;
+ bool finalized;
+
+ void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
+ AMDGPUMCExpr::VariantKind Kind,
+ const MachineFunction &MF,
+ const SmallVectorImpl<const Function *> &Callees);
+
+ // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
+ void assignMaxRegs();
+
+public:
+ MCResourceInfo(MCContext &OutContext)
+ : MaxVGPR(0), MaxAGPR(0), MaxSGPR(0), OutContext(OutContext),
+ finalized(false) {}
+ void addMaxVGPRCandidate(int32_t candidate) {
+ MaxVGPR = std::max(MaxVGPR, candidate);
+ }
+ void addMaxAGPRCandidate(int32_t candidate) {
+ MaxAGPR = std::max(MaxAGPR, candidate);
+ }
+ void addMaxSGPRCandidate(int32_t candidate) {
+ MaxSGPR = std::max(MaxSGPR, candidate);
+ }
+
+ MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK);
+ const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
+ MCContext &Ctx);
+
+ // Resolves the final symbols that requires the inter-function resource info
+ // to be resolved.
+ void Finalize();
+
+ MCSymbol *getMaxVGPRSymbol();
+ MCSymbol *getMaxAGPRSymbol();
+ MCSymbol *getMaxSGPRSymbol();
+
+ /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
+ /// granularity. However, some resource info has to be assigned the call
+ /// transitive maximum or accumulative. For example, if A calls B and B's VGPR
+ /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
+ /// functions with indirect calls should be assigned the module level maximum.
+ void gatherResourceInfo(
+ const MachineFunction &MF,
+ const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI);
+
+ const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
+ const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
+ MCContext &Ctx);
+};
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0aca99a82d1978..1ee3c40d69a3b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -13,14 +13,6 @@
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
-/// The analysis takes callees into account. E.g. if a function A that needs 10
-/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
-/// will return 20.
-/// It is assumed that an indirect call can go into any function except
-/// hardware-entrypoints. Therefore the register usage of functions with
-/// indirect calls is estimated as the maximum of all non-entrypoint functions
-/// in the module.
-///
//===----------------------------------------------------------------------===//
#include "AMDGPUResourceUsageAnalysis.h"
@@ -28,8 +20,8 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
@@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
- const GCNSubtarget &ST) const {
- return NumExplicitSGPR +
- IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
- ST.getTargetID().isXnackOnOrAny());
-}
-
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
- const GCNSubtarget &ST) const {
- return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
-}
-
-bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
+bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
if (!TPC)
return false;
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
- bool HasIndirectCall = false;
-
- CallGraph CG = CallGraph(M);
- auto End = po_end(&CG);
// By default, for code object v5 and later, track only the minimum scratch
// size
uint32_t AssumedStackSizeForDynamicSizeObjects =
clAssumedStackSizeForDynamicSizeObjects;
uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
- if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+ if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
+ AMDGPU::AMDHSA_COV5 ||
STI.getTargetTriple().getOS() == Triple::AMDPAL) {
- if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
+ if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
- if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
+ if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
AssumedStackSizeForExternalCall = 0;
}
- for (auto IT = po_begin(&CG); IT != End; ++IT) {
- Function *F = IT->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- MachineFunction *MF = MMI.getMachineFunction(*F);
- assert(MF && "function must have been generated already");
-
- auto CI =
- CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
- SIFunctionResourceInfo &Info = CI.first->second;
- assert(CI.second && "should only be called once per function");
- Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
- AssumedStackSizeForExternalCall);
- HasIndirectCall |= Info.HasIndirectCall;
- }
-
- // It's possible we have unreachable functions in the module which weren't
- // visited by the PO traversal. Make sure we have some resource counts to
- // report.
- for (const auto &IT : CG) {
- const Function *F = IT.first;
- if (!F || F->isDeclaration())
- continue;
-
- auto CI =
- CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
- if (!CI.second) // Skip already visited functions
- continue;
-
- SIFunctionResourceInfo &Info = CI.first->second;
- MachineFunction *MF = MMI.getMachineFunction(*F);
- assert(MF && "function must have been generated already");
- Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
- AssumedStackSizeForExternalCall);
- HasIndirectCall |= Info.HasIndirectCall;
- }
-
- if (HasIndirectCall)
- propagateIndirectCallRegisterUsage();
+ ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
+ AssumedStackSizeForExternalCall);
return false;
}
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
- const MachineFunction &MF, const TargetMachine &TM,
- uint32_t AssumedStackSizeForDynamicSizeObjects,
+ const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const {
SIFunctionResourceInfo Info;
@@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
int32_t MaxVGPR = -1;
int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
- uint64_t CalleeFrameSize = 0;
+ Info.CalleeSegmentSize = 0;
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
@@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
TII->getNamedOperand(MI, AMDGPU::OpName::callee);
const Function *Callee = getCalleeFunction(*CalleeOp);
- DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
- CallGraphResourceInfo.end();
// Avoid crashing on undefined behavior with an illegal call to a
// kernel. If a callsite's calling convention doesn't match the
@@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
report_fatal_error("invalid call to entry function");
+          auto isSameFunction = [](const MachineFunction &MFn, const Function *F) {
+            return F == &MFn.getFunction();
+          };
+
+ if (Callee && !isSameFunction(MF, Callee))
+ Info.Callees.push_back(Callee);
+
bool IsIndirect = !Callee || Callee->isDeclaration();
- if (!IsIndirect)
- I = CallGraphResourceInfo.find(Callee);
// FIXME: Call site could have norecurse on it
if (!Callee || !Callee->doesNotRecurse()) {
@@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
// directly call the tail called function. If a kernel directly
// calls a tail recursive function, we'll assume maximum stack size
// based on the regular call instruction.
- CalleeFrameSize = std::max(
- CalleeFrameSize,
+ Info.CalleeSegmentSize = std::max(
+ Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
}
}
- if (IsIndirect || I == CallGraphResourceInfo.end()) {
- CalleeFrameSize =
- std::max(CalleeFrameSize,
+ if (IsIndirect) {
+ Info.CalleeSegmentSize =
+ std::max(Info.CalleeSegmentSize,
static_cast<uint64_t>(AssumedStackSizeForExternalCall));
// Register usage of indirect calls gets handled later
@@ -555,19 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
Info.HasIndirectCall = true;
- } else {
- // We force CodeGen to run in SCC order, so the callee's register
- // usage etc. should be the cumulative usage of all callees.
- MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
- MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
- MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
- CalleeFrameSize =
- std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
- Info.UsesVCC |= I->second.UsesVCC;
- Info.UsesFlatScratch |= I->second.UsesFlatScratch;
- Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
- Info.HasRecursion |= I->second.HasRecursion;
- Info.HasIndirectCall |= I->second.HasIndirectCall;
}
}
}
@@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
Info.NumAGPR = MaxAGPR + 1;
- Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
}
-
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
- // Collect the maximum number of registers from non-hardware-entrypoints.
- // All these functions are potential targets for indirect calls.
- int32_t NonKernelMaxSGPRs = 0;
- int32_t NonKernelMaxVGPRs = 0;
- int32_t NonKernelMaxAGPRs = 0;
-
- for (const auto &I : CallGraphResourceInfo) {
- if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
- auto &Info = I.getSecond();
- NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
- NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
- NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
- }
- }
-
- // Add register usage for functions with indirect calls.
- // For calls to unknown functions, we assume the maximum register usage of
- // all non-hardware-entrypoints in the current module.
- for (auto &I : CallGraphResourceInfo) {
- auto &Info = I.getSecond();
- if (Info.HasIndirectCall) {
- Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
- Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
- Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
- }
- }
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 7f71de6749dcef..92ef41f49b3ba8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -15,8 +15,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
namespace llvm {
@@ -24,10 +24,9 @@ class GCNSubtarget;
class MachineFunction;
class TargetMachine;
-struct AMDGPUResourceUsageAnalysis : public ModulePass {
- static char ID;
-
+struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
public:
+ static char ID;
// Track resource usage for callee functions.
struct SIFunctionResourceInfo {
// Track the number of explicitly used VGPRs. Special registers reserved at
@@ -35,48 +34,33 @@ struct AMDGPUResourceUsageAnalysis : public ModulePass {
int32_t NumVGPR = 0;
int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
+ uint64_t CalleeSegmentSize = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
bool UsesFlatScratch = false;
bool HasDynamicallySizedStack = false;
bool HasRecursion = false;
bool HasIndirectCall = false;
-
- int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
- // Total number of VGPRs is actually a combination of AGPR and VGPR
- // depending on architecture - and some alignment constraints
- int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+ SmallVector<const Function *, 16> Callees;
};
- AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
+ AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {}
- bool doInitialization(Module &M) override {
- CallGraphResourceInfo.clear();
- return ModulePass::doInitialization(M);
- }
+ bool runOnMachineFunction(MachineFunction &MF) override;
- bool runOnModule(Module &M) override;
+ const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
- }
-
- const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
- auto Info = CallGraphResourceInfo.find(F);
- assert(Info != CallGraphResourceInfo.end() &&
- "Failed to find resource info for function");
- return Info->getSecond();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
private:
SIFunctionResourceInfo
- analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM,
+ analyzeResourceUsage(const MachineFunction &MF,
uint32_t AssumedStackSizeForDynamicSizeObjects,
uint32_t AssumedStackSizeForExternalCall) const;
- void propagateIndirectCallRegisterUsage();
-
- DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+ SIFunctionResourceInfo ResourceInfo;
};
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 671caf8484cd97..d685b635a82a6f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMCInstLower.cpp
AMDGPUIGroupLP.cpp
AMDGPUInsertSingleUseVDST.cpp
+ AMDGPUMCResourceInfo.cpp
AMDGPUMarkLastScratchLoad.cpp
AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 73d466abc66f7b..a1a41d6cc8c6a0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
<< Alignment.value() << '\n';
}
+void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+ const MCSymbol *HasIndirectCall) {
+#define PRINT_RES_INFO(ARG) \
+ OS << "\t.set "; \
+ ARG->print(OS, getContext().getAsmInfo()); \
+ OS << ", "; \
+ ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
+ Streamer.addBlankLine();
+
+ PRINT_RES_INFO(NumVGPR);
+ PRINT_RES_INFO(NumAGPR);
+ PRINT_RES_INFO(NumExplicitSGPR);
+ PRINT_RES_INFO(PrivateSegmentSize);
+ PRINT_RES_INFO(UsesVCC);
+ PRINT_RES_INFO(UsesFlatScratch);
+ PRINT_RES_INFO(HasDynamicallySizedStack);
+ PRINT_RES_INFO(HasRecursion);
+ PRINT_RES_INFO(HasIndirectCall);
+#undef PRINT_RES_INFO
+}
+
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+ const MCSymbol *MaxAGPR,
+ const MCSymbol *MaxSGPR) {
+#define PRINT_RES_INFO(ARG) \
+ OS << "\t.set "; \
+ ARG->print(OS, getContext().getAsmInfo()); \
+ OS << ", "; \
+ ARG->getVariableValue()->print(OS, getContext().getAsmInfo()); \
+ Streamer.addBlankLine();
+
+ PRINT_RES_INFO(MaxVGPR);
+ PRINT_RES_INFO(MaxAGPR);
+ PRINT_RES_INFO(MaxSGPR);
+#undef PRINT_RES_INFO
+}
+
bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index bf1538c71d1543..e41f302c3d56ce 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -60,6 +60,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) {
}
+ virtual void EmitMCResourceInfo(
+ const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+      const MCSymbol *HasIndirectCall) {}
+
+ virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+ const MCSymbol *MaxAGPR,
+                                      const MCSymbol *MaxSGPR) {}
+
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
@@ -136,6 +147,18 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
+ void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+ const MCSymbol *NumExplicitSGPR,
+ const MCSymbol *PrivateSegmentSize,
+ const MCSymbol *UsesVCC,
+ const MCSymbol *UsesFlatScratch,
+ const MCSymbol *HasDynamicallySizedStack,
+ const MCSymbol *HasRecursion,
+ const MCSymbol *HasIndirectCall) override;
+
+ void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
+ const MCSymbol *MaxSGPR) override;
+
/// \returns True on success, false on failure.
bool EmitISAVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index a53bf70d77717b..92d09b3afa77d7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
}
- ExprIt->getSecond() = Val;
} else if (N.getKind() == msgpack::Type::UInt) {
const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
- int64_t Unused;
- if (!Val->evaluateAsAbsolute(Unused))
- REM[Reg] = Val;
- (void)Unused;
+ } else {
+    // Default to uint64_t 0 so that subsequent calls to setRegister can
+    // continue to propagate ORs on top of it.
+ N = (uint64_t)0;
}
+ REM[Reg] = Val;
DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..a7ab4393f3b0ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3025,8 +3025,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3036,7 +3036,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3061,16 +3061,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 13
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -3116,8 +3116,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3152,16 +3152,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 9
-; MOVREL-NEXT: workitem_vgpr_count = 4
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -3208,8 +3208,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -3219,7 +3219,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3244,16 +3244,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 9
-; GFX10-NEXT: workitem_vgpr_count = 3
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -3311,7 +3311,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -3336,16 +3336,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 7
-; GFX11-NEXT: workitem_vgpr_count = 3
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4042,8 +4042,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4053,7 +4053,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4078,16 +4078,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 12
-; GPRIDX-NEXT: workitem_vgpr_count = 2
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4126,8 +4126,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4137,7 +4137,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4162,16 +4162,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 8
-; MOVREL-NEXT: workitem_vgpr_count = 3
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4211,8 +4211,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4222,7 +4222,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4247,16 +4247,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 8
-; GFX10-NEXT: workitem_vgpr_count = 2
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4296,8 +4296,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4307,7 +4307,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4332,16 +4332,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 5
-; GFX11-NEXT: workitem_vgpr_count = 2
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
@@ -4389,8 +4389,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: amd_machine_version_stepping = 0
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -4400,7 +4400,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_wgp_mode = 0
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
-; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; GPRIDX-NEXT: user_sgpr_count = 10
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4425,16 +4425,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_ordered_append_gds = 0
; GPRIDX-NEXT: private_element_size = 1
; GPRIDX-NEXT: is_ptr64 = 1
-; GPRIDX-NEXT: is_dynamic_callstack = 0
+; GPRIDX-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
-; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 13
-; GPRIDX-NEXT: workitem_vgpr_count = 3
+; GPRIDX-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
; GPRIDX-NEXT: reserved_sgpr_first = 0
@@ -4476,8 +4476,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: amd_machine_version_stepping = 3
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4487,7 +4487,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_wgp_mode = 0
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
-; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; MOVREL-NEXT: user_sgpr_count = 10
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4512,16 +4512,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_ordered_append_gds = 0
; MOVREL-NEXT: private_element_size = 1
; MOVREL-NEXT: is_ptr64 = 1
-; MOVREL-NEXT: is_dynamic_callstack = 0
+; MOVREL-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; MOVREL-NEXT: is_debug_enabled = 0
; MOVREL-NEXT: is_xnack_enabled = 0
-; MOVREL-NEXT: workitem_private_segment_byte_size = 0
+; MOVREL-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; MOVREL-NEXT: workgroup_group_segment_byte_size = 0
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 9
-; MOVREL-NEXT: workitem_vgpr_count = 4
+; MOVREL-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
; MOVREL-NEXT: reserved_sgpr_first = 0
@@ -4564,8 +4564,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: amd_machine_version_stepping = 0
; GFX10-NEXT: kernel_code_entry_byte_offset = 256
; GFX10-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT: granulated_workitem_vgpr_count = 0
-; GFX10-NEXT: granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX10-NEXT: priority = 0
; GFX10-NEXT: float_mode = 240
; GFX10-NEXT: priv = 0
@@ -4575,7 +4575,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
-; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
; GFX10-NEXT: user_sgpr_count = 10
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4600,16 +4600,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ordered_append_gds = 0
; GFX10-NEXT: private_element_size = 1
; GFX10-NEXT: is_ptr64 = 1
-; GFX10-NEXT: is_dynamic_callstack = 0
+; GFX10-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GFX10-NEXT: is_debug_enabled = 0
; GFX10-NEXT: is_xnack_enabled = 1
-; GFX10-NEXT: workitem_private_segment_byte_size = 0
+; GFX10-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GFX10-NEXT: workgroup_group_segment_byte_size = 0
; GFX10-NEXT: gds_segment_byte_size = 0
; GFX10-NEXT: kernarg_segment_byte_size = 28
; GFX10-NEXT: workgroup_fbarrier_count = 0
-; GFX10-NEXT: wavefront_sgpr_count = 9
-; GFX10-NEXT: workitem_vgpr_count = 3
+; GFX10-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GFX10-NEXT: reserved_vgpr_first = 0
; GFX10-NEXT: reserved_vgpr_count = 0
; GFX10-NEXT: reserved_sgpr_first = 0
@@ -4652,8 +4652,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: amd_machine_version_stepping = 0
; GFX11-NEXT: kernel_code_entry_byte_offset = 256
; GFX11-NEXT: kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT: granulated_workitem_vgpr_count = 0
-; GFX11-NEXT: granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT: granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT: granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
; GFX11-NEXT: priority = 0
; GFX11-NEXT: float_mode = 240
; GFX11-NEXT: priv = 0
@@ -4663,7 +4663,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
; GFX11-NEXT: enable_fwd_progress = 0
-; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5018)&1
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
; GFX11-NEXT: enable_sgpr_workgroup_id_x = 1
@@ -4688,16 +4688,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ordered_append_gds = 0
; GFX11-NEXT: private_element_size = 1
; GFX11-NEXT: is_ptr64 = 1
-; GFX11-NEXT: is_dynamic_callstack = 0
+; GFX11-NEXT: is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
; GFX11-NEXT: is_debug_enabled = 0
; GFX11-NEXT: is_xnack_enabled = 0
-; GFX11-NEXT: workitem_private_segment_byte_size = 0
+; GFX11-NEXT: workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
; GFX11-NEXT: workgroup_group_segment_byte_size = 0
; GFX11-NEXT: gds_segment_byte_size = 0
; GFX11-NEXT: kernarg_segment_byte_size = 28
; GFX11-NEXT: workgroup_fbarrier_count = 0
-; GFX11-NEXT: wavefront_sgpr_count = 7
-; GFX11-NEXT: workitem_vgpr_count = 3
+; GFX11-NEXT: wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT: workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
; GFX11-NEXT: reserved_vgpr_first = 0
; GFX11-NEXT: reserved_vgpr_count = 0
; GFX11-NEXT: reserved_sgpr_first = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 66b88236bbb4c1..0221d0f790be0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -18,7 +18,7 @@ target triple = "amdgcn-amd-amdhsa"
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 1
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_addrspacecast_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_addrspacecast_in_kernel_no_calls.has_dyn_sized_stack|stack_object_addrspacecast_in_kernel_no_calls.has_recursion))|128)&1
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -38,11 +38,12 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|140)&1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 1
-; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|128)&1
+; RW-FLAT: .amdhsa_reserve_flat_scratch stack_object_in_kernel_no_calls.uses_flat_scratch
+; RW-FLAT: .set stack_object_in_kernel_no_calls.uses_flat_scratch, 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -58,11 +59,12 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|136)&1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT: .amdhsa_enable_private_segment 0
-; RW-FLAT: .amdhsa_reserve_flat_scratch 0
+; RO-FLAT: .amdhsa_enable_private_segment (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|128)&1
+; RW-FLAT: .amdhsa_reserve_flat_scratch kernel_no_calls_no_stack.uses_flat_scratch
+; RW-FLAT: .set kernel_no_calls_no_stack.uses_flat_scratch, 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index d5646820a19832..374ce0676d2205 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -16,7 +16,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index c7afbeabbbb6b1..1c3db1d64b299d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -75,10 +75,10 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
; DEFAULTSIZE: ; ScratchSize: 16
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -137,10 +137,10 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
; DEFAULTSIZE: ; ScratchSize: 64
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 8d87b53efb4e73..e311be4b12218a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -2,9 +2,10 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
; GCN-LABEL: {{^}}kernel_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 44
-; GFX90A: .amdhsa_accum_offset 12
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_32_agprs.num_agpr, kernel_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GFX908: .set kernel_32_agprs.num_vgpr, 9
+; GFX908: .set kernel_32_agprs.num_agpr, 32
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
@@ -24,8 +25,9 @@ bb:
}
; GCN-LABEL: {{^}}kernel_0_agprs:
-; GCN: .amdhsa_next_free_vgpr 1
-; GFX90A: .amdhsa_accum_offset 4
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_0_agprs.num_agpr, kernel_0_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_0_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_0_agprs.num_vgpr, 1
; GCN: NumVgprs: 1
; GCN: NumAgprs: 0
; GCN: TotalNumVgprs: 1
@@ -42,9 +44,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_40_vgprs:
-; GFX908: .amdhsa_next_free_vgpr 40
-; GFX90A: .amdhsa_next_free_vgpr 56
-; GFX90A: .amdhsa_accum_offset 40
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_40_vgprs.num_agpr, kernel_40_vgprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_40_vgprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_40_vgprs.num_vgpr, 40
+; GFX90A: .set kernel_40_vgprs.num_agpr, 16
; GCN: NumVgprs: 40
; GCN: NumAgprs: 16
; GFX908: TotalNumVgprs: 40
@@ -99,9 +102,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_max_gprs:
-; GFX908: .amdhsa_next_free_vgpr 256
-; GFX90A: .amdhsa_next_free_vgpr 512
-; GFX90A: .amdhsa_accum_offset 256
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_max_gprs.num_agpr, kernel_max_gprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_max_gprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_max_gprs.num_vgpr, 256
+; GFX90A: .set kernel_max_gprs.num_agpr, 256
; GCN: NumVgprs: 256
; GCN: NumAgprs: 256
; GFX908: TotalNumVgprs: 256
@@ -121,8 +125,10 @@ bb:
}
; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_accum_offset 12
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_func_32_agprs.num_agpr, kernel_call_func_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_func_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_func_32_agprs.num_vgpr, max(0, func_32_agprs.num_vgpr)
+; GCN: .set kernel_call_func_32_agprs.num_agpr, max(0, func_32_agprs.num_agpr)
; GCN: NumVgprs: 9
; GCN: NumAgprs: 32
; GFX908: TotalNumVgprs: 32
@@ -154,25 +160,28 @@ bb:
declare void @undef_func()
; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
-; GCN: NumVgprs: 32
-; GCN: NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_undef_func.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; GCN: NumVgprs: kernel_call_undef_func.num_vgpr
+; GCN: NumAgprs: kernel_call_undef_func.num_agpr
+; GCN: TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
+; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1
+; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
+; GCN: NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4
+; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
ret void
}
+; GCN: .set max_num_vgpr, 32
+; GCN-NEXT: .set max_num_agpr, 32
+; GCN-NEXT: .set max_num_sgpr, 34
+
attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 9ec8e425a3f55c..993ff4e4477d35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -35,7 +35,8 @@
; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
; by 4 bytes.
-; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24
+; HSA-ALLOCA: .amdhsa_private_segment_fixed_size mova_same_clause.private_seg_size
+; HSA-ALLOCA: .set mova_same_clause.private_seg_size, 24
; HSA-ALLOCA: s_add_i32 s12, s12, s17
; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
index 99a7ae37e0e78d..f64a5e01cd2560 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -60,7 +60,9 @@ bb:
declare void @undef_func()
; CHECK: .type kernel_call_undef_func
-; CHECK: NumAgprs: 32
+; CHECK: .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; CHECK: NumAgprs: kernel_call_undef_func.num_agpr
+; CHECK: .set max_num_agpr, 32
define amdgpu_kernel void @kernel_call_undef_func() #0 {
bb:
call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index e4d427a0b826f8..c893f6b04b7b66 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 {
attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
; GCN-LABEL: {{^}}f512:
-; GFX9: NumVgprs: 128
-; GFX90A: NumVgprs: 128
-; GFX90A: NumAgprs: 128
-; GFX90A: TotalNumVgprs: 256
-; GFX10WGP-WAVE32: NumVgprs: 256
-; GFX10WGP-WAVE64: NumVgprs: 256
-; GFX10CU-WAVE32: NumVgprs: 128
-; GFX10CU-WAVE64: NumVgprs: 128
-; GFX11WGP-WAVE32: NumVgprs: 256
-; GFX11WGP-WAVE64: NumVgprs: 256
-; GFX11CU-WAVE32: NumVgprs: 192
-; GFX11CU-WAVE64: NumVgprs: 192
+; GFX9: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A: .set f512.num_agpr, max(128, max_num_agpr)
+; GFX10WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10CU-WAVE32: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE64: .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX11WGP-WAVE32: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11WGP-WAVE64: .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11CU-WAVE32: .set f512.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE64: .set f512.num_vgpr, max(192, max_num_vgpr)
+; GCN: NumVgprs: f512.num_vgpr
+; GFX90A: NumAgprs: f512.num_agpr
+; GFX90A: TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr)
define amdgpu_kernel void @f512() #512 {
call void @foo()
call void @use256vgprs()
@@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 {
attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
; GCN-LABEL: {{^}}f1024:
-; GFX9: NumVgprs: 64
-; GFX90A: NumAgprs: 64
-; GFX90A: TotalNumVgprs: 128
-; GFX10WGP-WAVE32: NumVgprs: 128
-; GFX10WGP-WAVE64: NumVgprs: 128
-; GFX10CU-WAVE32: NumVgprs: 64
-; GFX10CU-WAVE64: NumVgprs: 64
-; GFX11WGP-WAVE32: NumVgprs: 192
-; GFX11WGP-WAVE64: NumVgprs: 192
-; GFX11CU-WAVE32: NumVgprs: 96
-; GFX11CU-WAVE64: NumVgprs: 96
+; GFX9: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A: .set f1024.num_agpr, max(64, max_num_agpr)
+; GFX10WGP-WAVE32: .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10WGP-WAVE64: .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE32: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX10CU-WAVE64: .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX11WGP-WAVE32: .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11WGP-WAVE64: .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE32: .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GFX11CU-WAVE64: .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GCN: NumVgprs: f1024.num_vgpr
+; GFX90A: NumAgprs: f1024.num_agpr
+; GFX90A: TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr)
define amdgpu_kernel void @f1024() #1024 {
call void @foo()
call void @use256vgprs()
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..77be8605f20015 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -10,12 +10,21 @@
; OPT: .amdhsa_user_sgpr_dispatch_id 0
; OPT: .amdhsa_user_sgpr_flat_scratch_init 0
; OPT: .amdhsa_user_sgpr_private_segment_size 0
-; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|136)&1
; OPT: .amdhsa_system_sgpr_workgroup_id_x 1
; OPT: .amdhsa_system_sgpr_workgroup_id_y 0
; OPT: .amdhsa_system_sgpr_workgroup_id_z 0
; OPT: .amdhsa_system_sgpr_workgroup_info 0
; OPT: .amdhsa_system_vgpr_workitem_id 0
+; OPT: .set foo.num_vgpr, 0
+; OPT: .set foo.num_agpr, 0
+; OPT: .set foo.num_sgpr, 0
+; OPT: .set foo.private_seg_size, 0
+; OPT: .set foo.uses_vcc, 0
+; OPT: .set foo.uses_flat_scratch, 0
+; OPT: .set foo.has_dyn_sized_stack, 0
+; OPT: .set foo.has_recursion, 0
+; OPT: .set foo.has_indirect_call, 0
; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1
; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1
@@ -25,12 +34,25 @@
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
-; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5016)&1
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5012)&1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_y 1
; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1
-; NOOPT: .amdhsa_system_sgpr_workgroup_info 0
-; NOOPT: .amdhsa_system_vgpr_workitem_id 2
+; COV4: .amdhsa_system_sgpr_workgroup_info 0
+; COV5: .amdhsa_system_sgpr_workgroup_info 0
+; COV4: .amdhsa_system_vgpr_workitem_id 2
+; COV5: .amdhsa_system_vgpr_workitem_id 2
+; NOOPT: .set foo.num_vgpr, 0
+; NOOPT: .set foo.num_agpr, 0
+; NOOPT: .set foo.num_sgpr, 0
+; NOOPT: .set foo.private_seg_size, 0
+; NOOPT: .set foo.uses_vcc, 0
+; NOOPT: .set foo.uses_flat_scratch, 0
+; NOOPT: .set foo.has_dyn_sized_stack, 0
+; NOOPT: .set foo.has_recursion, 0
+; NOOPT: .set foo.has_indirect_call, 0
+
define amdgpu_kernel void @foo() {
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index a795e995603410..f5d45993742814 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
@@ -8,12 +8,13 @@
@alias = hidden alias void (), ptr @aliasee_default
; ALL-LABEL: {{^}}kernel:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX908-NEXT: .amdhsa_next_free_sgpr 33
+; ALL: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0)
+; ALL-NEXT: .amdhsa_next_free_sgpr (max(kernel.num_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
+; GFX90A-NEXT: .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
-; GFX90A: .amdhsa_next_free_vgpr 59
-; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
-; GFX90A-NEXT: .amdhsa_accum_offset 32
+; ALL: .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr)
+; ALL-NEXT: .set kernel.num_agpr, max(0, aliasee_default.num_agpr)
+; ALL-NEXT: .set kernel.num_sgpr, max(33, aliasee_default.num_sgpr)
define amdgpu_kernel void @kernel() #0 {
bb:
call void @alias() #2
@@ -25,6 +26,9 @@ bb:
call void asm sideeffect "; clobber a26 ", "~{a26}"()
ret void
}
+; ALL: .set aliasee_default.num_vgpr, 0
+; ALL-NEXT: .set aliasee_default.num_agpr, 27
+; ALL-NEXT: .set aliasee_default.num_sgpr, 32
attributes #0 = { noinline norecurse nounwind optnone }
attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index c976cc3d53b5eb..092e734ef106be 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -7,14 +7,18 @@
@alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel0:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel0.num_sgpr, max(33, aliasee_default_vgpr64_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel0() #0 {
bb:
call void @alias0() #2
ret void
}
+; CHECK: .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_sgpr, 32
define internal void @aliasee_default_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index edef71ef143dfd..f8287dc518421e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -9,8 +9,12 @@
; The parent kernel has a higher VGPR usage than the possible callees.
; CHECK-LABEL: {{^}}kernel1:
-; CHECK: .amdhsa_next_free_vgpr 41
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.num_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
+
+; CHECK: .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr)
+; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr)
+; CHECK-NEXT: .set kernel1.num_sgpr, max(33, aliasee_vgpr32_sgpr76.num_sgpr)
define amdgpu_kernel void @kernel1() #0 {
bb:
call void asm sideeffect "; clobber v40 ", "~{v40}"()
@@ -18,6 +22,9 @@ bb:
ret void
}
+; CHECK: .set aliasee_vgpr32_sgpr76.num_vgpr, 27
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_sgpr, 32
define internal void @aliasee_vgpr32_sgpr76() #1 {
bb:
call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index bb34ef1a15d2b9..a99b2295dfe85c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -7,14 +7,21 @@
@alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102
; CHECK-LABEL: {{^}}kernel2:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.num_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
+
+; CHECK: .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel2.num_sgpr, max(33, aliasee_vgpr64_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel2() #0 {
bb:
call void @alias2() #2
ret void
}
+; CHECK: .set aliasee_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_sgpr, 32
define internal void @aliasee_vgpr64_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 8a88eb7e51ad72..793dc1bc3a6f33 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -7,14 +7,21 @@
@alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102
; CHECK-LABEL: {{^}}kernel3:
-; CHECK: .amdhsa_next_free_vgpr 253
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK: .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.num_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
+
+; CHECK: .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel3.num_sgpr, max(33, aliasee_vgpr256_sgpr102.num_sgpr)
define amdgpu_kernel void @kernel3() #0 {
bb:
call void @alias3() #2
ret void
}
+; CHECK: .set aliasee_vgpr256_sgpr102.num_vgpr, 253
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_sgpr, 33
define internal void @aliasee_vgpr256_sgpr102() #1 {
bb:
call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6af45035d394f8..6311c2a01d366b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -222,10 +222,14 @@ ret:
}
; GCN-LABEL: {{^}}usage_direct_recursion:
-; GCN: .amdhsa_private_segment_fixed_size 18448
+; GCN: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(16384, direct_recursion_use_stack.private_seg_size))
+; GCN: ScratchSize: 18448
;
; GCN-V5-LABEL: {{^}}usage_direct_recursion:
-; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
+; GCN-V5: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN-V5: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN-V5: ScratchSize: 2064
define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
call void @direct_recursion_use_stack(i32 %n)
ret void
@@ -234,10 +238,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
; Make sure there's no assert when a sgpr96 is used.
; GCN-LABEL: {{^}}count_use_sgpr96_external_call
; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr96_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr96_external_call() {
entry:
tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -248,10 +253,11 @@ entry:
; Make sure there's no assert when a sgpr160 is used.
; GCN-LABEL: {{^}}count_use_sgpr160_external_call
; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr160_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_sgpr160_external_call() {
entry:
tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -262,10 +268,11 @@ entry:
; Make sure there's no assert when a vgpr160 is used.
; GCN-LABEL: {{^}}count_use_vgpr160_external_call
; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_vgpr160_external_call.num_sgpr+4
; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
define amdgpu_kernel void @count_use_vgpr160_external_call() {
entry:
tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -273,6 +280,27 @@ entry:
ret void
}
+; GCN: .set max_num_vgpr, 50
+; GCN: .set max_num_agpr, 0
+; GCN: .set max_num_sgpr, 80
+
+; GCN-LABEL: amdhsa.kernels:
+; GCN: .name: count_use_sgpr96_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+; GCN: .name: count_use_sgpr160_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+; GCN: .name: count_use_vgpr160_external_call
+; CI: .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG: .sgpr_count: 96
+; GCN: .vgpr_count: 50
+
attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..7e731a70ca4d76 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -469,7 +469,7 @@ define hidden void @use_every_sgpr_input() #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input.has_recursion))|920)&1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
@@ -494,7 +494,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_user_sgpr_dispatch_id 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input_no_kernargs.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input_no_kernargs.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input_no_kernargs.has_recursion))|916)&1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 643f2619840a22..ede57f1a0a04ce 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -33,6 +33,7 @@ bb2:
; GCN-LABEL: {{^}}preserve_condition_undef_flag:
; GCN-NOT: vcc
+; GCN: s_endpgm
define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3035a8579c8a6d..13fd714933dbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -15,13 +15,19 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fadd.num_agpr, fadd.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fadd.num_sgpr+(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fadd.uses_vcc
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fadd.uses_flat_scratch
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
+; OSABI-AMDHSA-ASM: .set fadd.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fadd.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fadd.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fadd.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fadd.uses_flat_scratch, 0
+
; ALL-ASM-LABEL: {{^}}fsub:
; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel
@@ -34,13 +40,19 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 10
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(fsub.num_agpr, fsub.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr (max(fsub.num_sgpr+(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc fsub.uses_vcc
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch fsub.uses_flat_scratch
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
+; OSABI-AMDHSA-ASM: .set fsub.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fsub.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fsub.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fsub.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fsub.uses_flat_scratch, 0
+
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_isa
@@ -93,8 +105,10 @@ entry:
; registers used.
;
; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM: .amdhsa_next_free_vgpr 1
-; ALL-ASM: .amdhsa_next_free_sgpr 1
+; ALL-ASM: .amdhsa_next_free_vgpr max(totalnumvgprs(empty.num_agpr, empty.num_vgpr), 1, 0)
+; ALL-ASM: .amdhsa_next_free_sgpr (max(empty.num_sgpr+(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0))
+; ALL-ASM: NumSGPRsForWavesPerEU: 1
+; ALL-ASM: NumVGPRsForWavesPerEU: 1
define amdgpu_kernel void @empty(
i32 %i,
ptr addrspace(1) %r,
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
index 9d93609b1e8813..aa1a93cfec3d86 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
@@ -1,8 +1,8 @@
; REQUIRES: asserts
-; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
-; CHECK: function must have been generated already
+; CHECK-NOT: func
define internal i32 @func() {
ret i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 789150f690d52e..69a729f6847f0a 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -62,7 +62,8 @@
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-; VGPR: .amdhsa_private_segment_fixed_size 16
+; VGPR: .amdhsa_private_segment_fixed_size divergent_if_endif.private_seg_size
+; VGPR: .set divergent_if_endif.private_seg_size, 16
define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +134,8 @@ endif:
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-; VGPR: .amdhsa_private_segment_fixed_size 20
+; VGPR: .amdhsa_private_segment_fixed_size divergent_loop.private_seg_size
+; VGPR: .set divergent_loop.private_seg_size, 20
define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll
index f51d9fc5125ba6..423bb95af25df9 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=CARRIZO %s
; Test that we don't try to produce a COFF file on windows
; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
@@ -20,8 +20,9 @@
; CONFIG: .section .AMDGPU.config
; CONFIG-NEXT: .long 45096
-; TYPICAL-NEXT: .long 0
-; TONGA-NEXT: .long 704
+; TYPICAL-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 0)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
+; TONGA-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(96, 1), 8))/8)-1)&15)<<6)
+; CARRIZO-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 1)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
; CONFIG: .p2align 8
; CONFIG: test:
define amdgpu_ps void @test(i32 %p) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
index 78ac2f9eaff020..c4111282682527 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
@@ -7,13 +7,17 @@
; No stack objects, only indirect call has to enable scrathch
; GCN-LABEL: test_indirect_call:
-; COV5: .amdhsa_private_segment_fixed_size 0{{$}}
-; COV4: .amdhsa_private_segment_fixed_size 16384{{$}}
-
+; GCN: .amdhsa_private_segment_fixed_size test_indirect_call.private_seg_size
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
+; COV5: .amdhsa_uses_dynamic_stack ((59|((test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion)<<11))&2048)>>11
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5016)&1
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5020)&1
+
+; COV5: .set test_indirect_call.private_seg_size, 0{{$}}
+; COV4: .set test_indirect_call.private_seg_size, 0+(max(16384))
+; COV5: .set test_indirect_call.has_recursion, 1
+; COV5: .set test_indirect_call.has_indirect_call, 1
-; COV5: .amdhsa_uses_dynamic_stack 1
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
define amdgpu_kernel void @test_indirect_call() {
%fptr = load ptr, ptr addrspace(4) @gv.fptr0
call void %fptr()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
new file mode 100644
index 00000000000000..c411323a70ed31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -0,0 +1,533 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; SGPR use may not seem equal to the sgpr use provided in comments as the latter includes extra sgprs (e.g., for vcc use).
+
+; Functions that don't make calls should have constants as their resource usage as no resource information has to be propagated.
+
+; GCN-LABEL: {{^}}use_vcc:
+; GCN: .set use_vcc.num_vgpr, 0
+; GCN: .set use_vcc.num_agpr, 0
+; GCN: .set use_vcc.num_sgpr, 32
+; GCN: .set use_vcc.private_seg_size, 0
+; GCN: .set use_vcc.uses_vcc, 1
+; GCN: .set use_vcc.uses_flat_scratch, 0
+; GCN: .set use_vcc.has_dyn_sized_stack, 0
+; GCN: .set use_vcc.has_recursion, 0
+; GCN: .set use_vcc.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_vcc() #1 {
+ call void asm sideeffect "", "~{vcc}" () #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_vcc:
+; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr)
+; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr)
+; GCN: .set indirect_use_vcc.num_sgpr, max(34, use_vcc.num_sgpr)
+; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size))
+; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc)
+; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch)
+; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion)
+; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_vcc() #1 {
+ call void @use_vcc()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
+; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_sgpr, max(33, indirect_use_vcc.num_sgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size))
+; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc)
+; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch)
+; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion)
+; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
+ call void @indirect_use_vcc()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scratch:
+; GCN: .set use_flat_scratch.num_vgpr, 0
+; GCN: .set use_flat_scratch.num_agpr, 0
+; GCN: .set use_flat_scratch.num_sgpr, 32
+; GCN: .set use_flat_scratch.private_seg_size, 0
+; GCN: .set use_flat_scratch.uses_vcc, 0
+; GCN: .set use_flat_scratch.uses_flat_scratch, 1
+; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
+; GCN: .set use_flat_scratch.has_recursion, 0
+; GCN: .set use_flat_scratch.has_indirect_call, 0
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_flat_scratch() #1 {
+ call void asm sideeffect "", "~{flat_scratch}" () #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_flat_scratch:
+; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr)
+; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr)
+; GCN: .set indirect_use_flat_scratch.num_sgpr, max(34, use_flat_scratch.num_sgpr)
+; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size))
+; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc)
+; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion)
+; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_flat_scratch() #1 {
+ call void @use_flat_scratch()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_sgpr, max(33, indirect_use_flat_scratch.num_sgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size))
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
+ call void @indirect_use_flat_scratch()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_10_vgpr:
+; GCN: .set use_10_vgpr.num_vgpr, 10
+; GCN: .set use_10_vgpr.num_agpr, 0
+; GCN: .set use_10_vgpr.num_sgpr, 32
+; GCN: .set use_10_vgpr.private_seg_size, 0
+; GCN: .set use_10_vgpr.uses_vcc, 0
+; GCN: .set use_10_vgpr.uses_flat_scratch, 0
+; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
+; GCN: .set use_10_vgpr.has_recursion, 0
+; GCN: .set use_10_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 10
+; GCN: ScratchSize: 0
+define void @use_10_vgpr() #1 {
+ call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
+ call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_10_vgpr:
+; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr)
+; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr)
+; GCN: .set indirect_use_10_vgpr.num_sgpr, max(34, use_10_vgpr.num_sgpr)
+; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size))
+; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc)
+; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch)
+; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion)
+; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_10_vgpr() #0 {
+ call void @use_10_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
+; GCN: .set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr)
+; GCN: .set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr)
+; GCN: .set indirect_2_level_use_10_vgpr.num_sgpr, max(33, indirect_use_10_vgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size))
+; GCN: .set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc)
+; GCN: .set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion)
+; GCN: .set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
+ call void @indirect_use_10_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_50_vgpr:
+; GCN: .set use_50_vgpr.num_vgpr, 50
+; GCN: .set use_50_vgpr.num_agpr, 0
+; GCN: .set use_50_vgpr.num_sgpr, 32
+; GCN: .set use_50_vgpr.private_seg_size, 0
+; GCN: .set use_50_vgpr.uses_vcc, 0
+; GCN: .set use_50_vgpr.uses_flat_scratch, 0
+; GCN: .set use_50_vgpr.has_dyn_sized_stack, 0
+; GCN: .set use_50_vgpr.has_recursion, 0
+; GCN: .set use_50_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 0
+define void @use_50_vgpr() #1 {
+ call void asm sideeffect "", "~{v49}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_50_vgpr:
+; GCN: .set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr)
+; GCN: .set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr)
+; GCN: .set indirect_use_50_vgpr.num_sgpr, max(34, use_50_vgpr.num_sgpr)
+; GCN: .set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size))
+; GCN: .set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
+; GCN: .set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
+; GCN: .set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion)
+; GCN: .set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 16
+define void @indirect_use_50_vgpr() #0 {
+ call void @use_50_vgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_80_sgpr:
+; GCN: .set use_80_sgpr.num_vgpr, 1
+; GCN: .set use_80_sgpr.num_agpr, 0
+; GCN: .set use_80_sgpr.num_sgpr, 80
+; GCN: .set use_80_sgpr.private_seg_size, 8
+; GCN: .set use_80_sgpr.uses_vcc, 0
+; GCN: .set use_80_sgpr.uses_flat_scratch, 0
+; GCN: .set use_80_sgpr.has_dyn_sized_stack, 0
+; GCN: .set use_80_sgpr.has_recursion, 0
+; GCN: .set use_80_sgpr.has_indirect_call, 0
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 8
+define void @use_80_sgpr() #1 {
+ call void asm sideeffect "", "~{s79}"() #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_80_sgpr:
+; GCN: .set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
+; GCN: .set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
+; GCN: .set indirect_use_80_sgpr.num_sgpr, max(34, use_80_sgpr.num_sgpr)
+; GCN: .set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
+; GCN: .set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
+; GCN: .set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
+; GCN: .set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion)
+; GCN: .set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define void @indirect_use_80_sgpr() #1 {
+ call void @use_80_sgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
+; GCN: .set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
+; GCN: .set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
+; GCN: .set indirect_2_level_use_80_sgpr.num_sgpr, max(33, indirect_use_80_sgpr.num_sgpr)
+; GCN: .set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
+; GCN: .set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
+; GCN: .set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion)
+; GCN: .set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 86
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
+ call void @indirect_use_80_sgpr()
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack0:
+; GCN: .set use_stack0.num_vgpr, 1
+; GCN: .set use_stack0.num_agpr, 0
+; GCN: .set use_stack0.num_sgpr, 33
+; GCN: .set use_stack0.private_seg_size, 2052
+; GCN: .set use_stack0.uses_vcc, 0
+; GCN: .set use_stack0.uses_flat_scratch, 0
+; GCN: .set use_stack0.has_dyn_sized_stack, 0
+; GCN: .set use_stack0.has_recursion, 0
+; GCN: .set use_stack0.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 2052
+define void @use_stack0() #1 {
+ %alloca = alloca [512 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack1:
+; GCN: .set use_stack1.num_vgpr, 1
+; GCN: .set use_stack1.num_agpr, 0
+; GCN: .set use_stack1.num_sgpr, 33
+; GCN: .set use_stack1.private_seg_size, 404
+; GCN: .set use_stack1.uses_vcc, 0
+; GCN: .set use_stack1.uses_flat_scratch, 0
+; GCN: .set use_stack1.has_dyn_sized_stack, 0
+; GCN: .set use_stack1.has_recursion, 0
+; GCN: .set use_stack1.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 404
+define void @use_stack1() #1 {
+ %alloca = alloca [100 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_stack:
+; GCN: .set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
+; GCN: .set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
+; GCN: .set indirect_use_stack.num_sgpr, max(34, use_stack0.num_sgpr)
+; GCN: .set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
+; GCN: .set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
+; GCN: .set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
+; GCN: .set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
+; GCN: .set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion)
+; GCN: .set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define void @indirect_use_stack() #1 {
+ %alloca = alloca [16 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ call void @use_stack0()
+ ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_stack:
+; GCN: .set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
+; GCN: .set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
+; GCN: .set indirect_2_level_use_stack.num_sgpr, max(33, indirect_use_stack.num_sgpr)
+; GCN: .set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
+; GCN: .set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
+; GCN: .set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
+; GCN: .set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
+; GCN: .set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion)
+; GCN: .set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
+ call void @indirect_use_stack()
+ ret void
+}
+
+
+; Should be maximum of callee usage
+; GCN-LABEL: {{^}}multi_call_use_use_stack:
+; GCN: .set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
+; GCN: .set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
+; GCN: .set multi_call_use_use_stack.num_sgpr, max(42, use_stack0.num_sgpr, use_stack1.num_sgpr)
+; GCN: .set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN: .set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
+; GCN: .set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
+; GCN: .set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
+; GCN: .set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
+; GCN: .set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
+; GCN: NumSgprs: 48
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2052
+define amdgpu_kernel void @multi_call_use_use_stack() #0 {
+ call void @use_stack0()
+ call void @use_stack1()
+ ret void
+}
+
+declare void @external() #0
+
+; GCN-LABEL: {{^}}multi_call_with_external:
+; GCN: .set multi_call_with_external.num_vgpr, max(41, max_num_vgpr)
+; GCN: .set multi_call_with_external.num_agpr, max(0, max_num_agpr)
+; GCN: .set multi_call_with_external.num_sgpr, max(42, max_num_sgpr)
+; GCN: .set multi_call_with_external.private_seg_size, 0
+; GCN: .set multi_call_with_external.uses_vcc, 1
+; GCN: .set multi_call_with_external.uses_flat_scratch, 1
+; GCN: .set multi_call_with_external.has_dyn_sized_stack, 1
+; GCN: .set multi_call_with_external.has_recursion, 0
+; GCN: .set multi_call_with_external.has_indirect_call, 1
+; GCN: NumSgprs: multi_call_with_external.num_sgpr+6
+; GCN: NumVgprs: multi_call_with_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @multi_call_with_external() #0 {
+ call void @use_stack0()
+ call void @use_stack1()
+ call void @external()
+ ret void
+}
+
+; GCN-LABEL: {{^}}usage_external:
+; GCN: .set usage_external.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set usage_external.num_agpr, max(0, max_num_agpr)
+; GCN: .set usage_external.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external.private_seg_size, 0
+; GCN: .set usage_external.uses_vcc, 1
+; GCN: .set usage_external.uses_flat_scratch, 1
+; GCN: .set usage_external.has_dyn_sized_stack, 1
+; GCN: .set usage_external.has_recursion, 0
+; GCN: .set usage_external.has_indirect_call, 1
+; GCN: NumSgprs: usage_external.num_sgpr+6
+; GCN: NumVgprs: usage_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external() #0 {
+ call void @external()
+ ret void
+}
+
+declare void @external_recurse() #2
+
+; GCN-LABEL: {{^}}usage_external_recurse:
+; GCN: .set usage_external_recurse.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set usage_external_recurse.num_agpr, max(0, max_num_agpr)
+; GCN: .set usage_external_recurse.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set usage_external_recurse.private_seg_size, 0
+; GCN: .set usage_external_recurse.uses_vcc, 1
+; GCN: .set usage_external_recurse.uses_flat_scratch, 1
+; GCN: .set usage_external_recurse.has_dyn_sized_stack, 1
+; GCN: .set usage_external_recurse.has_recursion, 1
+; GCN: .set usage_external_recurse.has_indirect_call, 1
+; GCN: NumSgprs: usage_external_recurse.num_sgpr+6
+; GCN: NumVgprs: usage_external_recurse.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external_recurse() #0 {
+ call void @external_recurse()
+ ret void
+}
+
+; GCN-LABEL: {{^}}direct_recursion_use_stack:
+; GCN: .set direct_recursion_use_stack.num_vgpr, 41
+; GCN: .set direct_recursion_use_stack.num_agpr, 0
+; GCN: .set direct_recursion_use_stack.num_sgpr, 36
+; GCN: .set direct_recursion_use_stack.private_seg_size, 2064
+; GCN: .set direct_recursion_use_stack.uses_vcc, 1
+; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0
+; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0
+; GCN: .set direct_recursion_use_stack.has_recursion, 1
+; GCN: .set direct_recursion_use_stack.has_indirect_call, 0
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define void @direct_recursion_use_stack(i32 %val) #2 {
+ %alloca = alloca [512 x i32], align 4, addrspace(5)
+ call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %ret, label %call
+
+call:
+ %val.sub1 = sub i32 %val, 1
+ call void @direct_recursion_use_stack(i32 %val.sub1)
+ br label %ret
+
+ret:
+ ret void
+}
+
+; GCN-LABEL: {{^}}usage_direct_recursion:
+; GCN: .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
+; GCN: .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
+; GCN: .set usage_direct_recursion.num_sgpr, max(33, direct_recursion_use_stack.num_sgpr)
+; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN: .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
+; GCN: .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
+; GCN: .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
+; GCN: .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion)
+; GCN: .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call)
+; GCN: NumSgprs: 42
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
+ call void @direct_recursion_use_stack(i32 %n)
+ ret void
+}
+
+; Make sure there's no assert when a sgpr96 is used.
+; GCN-LABEL: {{^}}count_use_sgpr96_external_call
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr96_external_call.private_seg_size, 0
+; GCN: .set count_use_sgpr96_external_call.uses_vcc, 1
+; GCN: .set count_use_sgpr96_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_sgpr96_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_sgpr96_external_call.has_recursion, 0
+; GCN: .set count_use_sgpr96_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr96_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr96_external_call() {
+entry:
+ tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
+ call void @external()
+ ret void
+}
+
+; Make sure there's no assert when a sgpr160 is used.
+; GCN-LABEL: {{^}}count_use_sgpr160_external_call
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_sgpr160_external_call.private_seg_size, 0
+; GCN: .set count_use_sgpr160_external_call.uses_vcc, 1
+; GCN: .set count_use_sgpr160_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_sgpr160_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_sgpr160_external_call.has_recursion, 0
+; GCN: .set count_use_sgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr160_external_call() {
+entry:
+ tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+ call void @external()
+ ret void
+}
+
+; Make sure there's no assert when a vgpr160 is used.
+; GCN-LABEL: {{^}}count_use_vgpr160_external_call
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN: .set count_use_vgpr160_external_call.private_seg_size, 0
+; GCN: .set count_use_vgpr160_external_call.uses_vcc, 1
+; GCN: .set count_use_vgpr160_external_call.uses_flat_scratch, 1
+; GCN: .set count_use_vgpr160_external_call.has_dyn_sized_stack, 1
+; GCN: .set count_use_vgpr160_external_call.has_recursion, 0
+; GCN: .set count_use_vgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_vgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_vgpr160_external_call() {
+entry:
+ tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+ call void @external()
+ ret void
+}
+
+; Added at the end of the .s are the module-level maximums
+; GCN: .set max_num_vgpr, 50
+; GCN: .set max_num_agpr, 0
+; GCN: .set max_num_sgpr, 80
+
+attributes #0 = { nounwind noinline norecurse }
+attributes #1 = { nounwind noinline norecurse }
+attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 0f951e89d37c8a..4f300e2282426e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -36,8 +36,8 @@
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -65,8 +65,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -98,8 +98,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -145,8 +145,8 @@ define amdgpu_kernel void @queue_ptr() #1 {
; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
index f20d720c3876ba..dce4162c246247 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
@@ -41,7 +41,7 @@ entry:
}
; FIXME: This should warn too
-; ERR-NOT: warning
+; ERR-NOT: warning: inline asm clobber list contains reserved registers
define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) {
entry:
%exec = call i64 asm sideeffect "; def $0", "={exec}"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index cddfb21a6fbdf4..6e07e2fc9da79a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -3,6 +3,18 @@
declare i32 @llvm.amdgcn.workitem.id.x()
+define <2 x i64> @f1() #0 {
+; GFX11-LABEL: f1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ ret <2 x i64> zeroinitializer
+}
+
define void @f0() {
; GFX11-LABEL: f0:
; GFX11: ; %bb.0: ; %bb
@@ -36,18 +48,6 @@ bb:
ret void
}
-define <2 x i64> @f1() #0 {
-; GFX11-LABEL: f1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_mov_b32_e32 v3, 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- ret <2 x i64> zeroinitializer
-}
-
; FIXME: This generates "instid1(/* invalid instid value */)".
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
; GFX11-LABEL: f2:
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index b49931379b84a5..4575df1e0c6b95 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -105,13 +105,6 @@ define void @test_funcx2() #0 {
ret void
}
-; GCN-LABEL: {{^}}wombat:
-define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
-bb:
- call void @hoge() #0
- ret void
-}
-
; Make sure we save/restore the return address around the call.
; Function Attrs: norecurse
define internal void @hoge() #2 {
@@ -128,6 +121,13 @@ bb:
ret void
}
+; GCN-LABEL: {{^}}wombat:
+define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
+bb:
+ call void @hoge() #0
+ ret void
+}
+
declare dso_local void @eggs()
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da251..d9a5a49e75f0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -5,12 +5,14 @@ declare void @llvm.trap() #0
; DOORBELL: .amdhsa_kernel trap
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
-; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
+; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size trap.private_seg_size
; DOORBELL-NEXT: .amdhsa_kernarg_size 8
; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12
; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; DOORBELL: .end_amdhsa_kernel
+; DOORBELL: .set trap.private_seg_size, 0
+
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 {
store volatile i32 1, ptr addrspace(1) %arg0
call void @llvm.trap()
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
index cdd6e88dd103b7..f3d9e9a727c251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -1,6 +1,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
; CHECK-LABEL: non_kernel_recursion:
+; CHECK: .set non_kernel_recursion.has_recursion, 1
+; CHECK: .set non_kernel_recursion.has_indirect_call, 0
define void @non_kernel_recursion(i32 %val) #2 {
%cmp = icmp eq i32 %val, 0
br i1 %cmp, label %ret, label %call
@@ -16,8 +18,11 @@ ret:
; CHECK-LABEL: kernel_caller_recursion:
; CHECK: .amd_kernel_code_t
-; CHECK: is_dynamic_callstack = 1
+; CHECK: is_dynamic_callstack = kernel_caller_recursion.has_dyn_sized_stack|kernel_caller_recursion.has_recursion
; CHECK: .end_amd_kernel_code_t
+
+; CHECK: .set kernel_caller_recursion.has_recursion, or(1, non_kernel_recursion.has_recursion)
+; CHECK: .set kernel_caller_recursion.has_indirect_call, or(0, non_kernel_recursion.has_indirect_call)
define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
call void @non_kernel_recursion(i32 %n)
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 7698372b687797..64bc14d750573b 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,10 +1,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=GCN,CI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,VI,ALL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s
-; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=BON,GCNHSA,ALL %s
+; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=CAR,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GFX11,GCNHSA,ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
@@ -24,7 +24,7 @@
; GCNHSA: .amdhsa_kernel large_alloca_compute_shader
; GCNHSA: .amdhsa_group_segment_fixed_size 0
-; GCNHSA: .amdhsa_private_segment_fixed_size 32772
+; GCNHSA: .amdhsa_private_segment_fixed_size large_alloca_compute_shader.private_seg_size
; GCNHSA: .amdhsa_user_sgpr_private_segment_buffer 1
; GCNHSA: .amdhsa_user_sgpr_dispatch_ptr 1
; GCNHSA: .amdhsa_user_sgpr_queue_ptr 1
@@ -32,14 +32,19 @@
; GCNHSA: .amdhsa_user_sgpr_dispatch_id 1
; GCNHSA: .amdhsa_user_sgpr_flat_scratch_init 1
; GCNHSA: .amdhsa_user_sgpr_private_segment_size 0
-; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCNHSA: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(large_alloca_compute_shader.private_seg_size*{{32|64}}, {{1024|256}}))/{{1024|256}})>0)||(large_alloca_compute_shader.has_dyn_sized_stack|large_alloca_compute_shader.has_recursion))|5020)&1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_x 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_y 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1
; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0
; GCNHSA: .amdhsa_system_vgpr_workitem_id 2
-; GCNHSA: .amdhsa_next_free_vgpr 3
-; GCNHSA: .amdhsa_next_free_sgpr 18
+; GCNHSA: .amdhsa_next_free_vgpr max(totalnumvgprs(large_alloca_compute_shader.num_agpr, large_alloca_compute_shader.num_vgpr), 1, 0)
+; BON: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; CAR: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX10: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX11: .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; GCNHSA: .amdhsa_reserve_vcc large_alloca_compute_shader.uses_vcc
+; GCNHSA: .amdhsa_reserve_flat_scratch large_alloca_compute_shader.uses_flat_scratch
; GCNHSA: .amdhsa_float_round_mode_32 0
; GCNHSA: .amdhsa_float_round_mode_16_64 0
; GCNHSA: .amdhsa_float_denorm_mode_32 3
@@ -55,6 +60,16 @@
; GCNHSA: .amdhsa_exception_int_div_zero 0
; GCNHSA: .end_amdhsa_kernel
+; GCNHSA: .set large_alloca_compute_shader.num_vgpr, 3
+; GCNHSA: .set large_alloca_compute_shader.num_agpr, 0
+; GCNHSA: .set large_alloca_compute_shader.num_sgpr, 18
+; GCNHSA: .set large_alloca_compute_shader.private_seg_size, 32772
+; GCNHSA: .set large_alloca_compute_shader.uses_vcc
+; GCNHSA: .set large_alloca_compute_shader.uses_flat_scratch, 0
+; GCNHSA: .set large_alloca_compute_shader.has_dyn_sized_stack, 0
+; GCNHSA: .set large_alloca_compute_shader.has_recursion, 0
+; GCNHSA: .set large_alloca_compute_shader.has_indirect_call, 0
+
; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
; ALL: ; ScratchSize: 32772
define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 1b1ea52520c0bf..cf331e119126ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,12 +146,9 @@
; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O0-NEXT: Machine Optimization Remark Emitter
; GCN-O0-NEXT: Stack Frame Layout Analysis
-; GCN-O0-NEXT: Function register usage analysis
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O0-NEXT: Machine Optimization Remark Emitter
-; GCN-O0-NEXT: AMDGPU Assembly Printer
-; GCN-O0-NEXT: Free MachineFunction
+; GCN-O0-NEXT: Function register usage analysis
+; GCN-O0-NEXT: AMDGPU Assembly Printer
+; GCN-O0-NEXT: Free MachineFunction
; GCN-O1:Target Library Information
; GCN-O1-NEXT:Target Pass Configuration
@@ -421,12 +418,9 @@
; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Stack Frame Layout Analysis
-; GCN-O1-NEXT: Function register usage analysis
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O1-NEXT: Machine Optimization Remark Emitter
-; GCN-O1-NEXT: AMDGPU Assembly Printer
-; GCN-O1-NEXT: Free MachineFunction
+; GCN-O1-NEXT: Function register usage analysis
+; GCN-O1-NEXT: AMDGPU Assembly Printer
+; GCN-O1-NEXT: Free MachineFunction
; GCN-O1-OPTS:Target Library Information
; GCN-O1-OPTS-NEXT:Target Pass Configuration
@@ -724,12 +718,9 @@
; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Stack Frame Layout Analysis
-; GCN-O1-OPTS-NEXT: Function register usage analysis
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
-; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
-; GCN-O1-OPTS-NEXT: Free MachineFunction
+; GCN-O1-OPTS-NEXT: Function register usage analysis
+; GCN-O1-OPTS-NEXT: AMDGPU Assembly Printer
+; GCN-O1-OPTS-NEXT: Free MachineFunction
; GCN-O2:Target Library Information
; GCN-O2-NEXT:Target Pass Configuration
@@ -1033,12 +1024,9 @@
; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Stack Frame Layout Analysis
-; GCN-O2-NEXT: Function register usage analysis
-; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O2-NEXT: Machine Optimization Remark Emitter
-; GCN-O2-NEXT: AMDGPU Assembly Printer
-; GCN-O2-NEXT: Free MachineFunction
+; GCN-O2-NEXT: Function register usage analysis
+; GCN-O2-NEXT: AMDGPU Assembly Printer
+; GCN-O2-NEXT: Free MachineFunction
; GCN-O3:Target Library Information
; GCN-O3-NEXT:Target Pass Configuration
@@ -1354,12 +1342,9 @@
; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Stack Frame Layout Analysis
-; GCN-O3-NEXT: Function register usage analysis
-; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: Lazy Machine Block Frequency Analysis
-; GCN-O3-NEXT: Machine Optimization Remark Emitter
-; GCN-O3-NEXT: AMDGPU Assembly Printer
-; GCN-O3-NEXT: Free MachineFunction
+; GCN-O3-NEXT: Function register usage analysis
+; GCN-O3-NEXT: AMDGPU Assembly Printer
+; GCN-O3-NEXT: Free MachineFunction
define void @empty() {
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index eaee8ec73fe411..778060d3c5fb3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
; ALL-LABEL: {{^}}test_workitem_id_x:
; MESA3D: enable_vgpr_workitem_id = 0
@@ -29,7 +29,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
; ALL-LABEL: {{^}}test_workitem_id_y:
; MESA3D: enable_vgpr_workitem_id = 1
@@ -47,7 +47,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
; MESA: .section .AMDGPU.config
; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
; ALL-LABEL: {{^}}test_workitem_id_z:
; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index 34dcdaf29677e4..b508ffff8050a8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -9,6 +9,19 @@
@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
+; GCN-LABEL: {{^}}f0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
+; GCN: ds_write_b8 [[NULL]], [[TREE]]
+define void @f0() {
+; OPT-LABEL: @f0() {
+; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
+; OPT-NEXT: ret void
+;
+ store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
+ ret void
+}
+
; GCN-LABEL: {{^}}k0:
; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
@@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() {
call void @f0()
ret void
}
-
-; GCN-LABEL: {{^}}f0:
-; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
-; GCN: ds_write_b8 [[NULL]], [[TREE]]
-define void @f0() {
-; OPT-LABEL: @f0() {
-; OPT-NEXT: store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
-; OPT-NEXT: ret void
-;
- store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index 7f0f473c11bd59..f3cf2a5ca8ff62 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,14 +1,15 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,ALL %s
; SPI_TMPRING_SIZE.WAVESIZE = 5
; GFX10: .long 165608
-; GFX10-NEXT: .long 20480
+; GFX10-NEXT: .long (((alignto(scratch_ps.private_seg_size*32, 1024))/1024)&8191)<<12
; SPI_TMPRING_SIZE.WAVESIZE = 17
; GFX11: .long 165608
-; GFX11-NEXT: .long 69632
+; 11XFG-TXEN: .long 69632
+; GFX11-NEXT:.long (((alignto(scratch_ps.private_seg_size*32, 256))/256)&32767)<<12
; GCN-LABEL: {{^}}scratch_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
@@ -23,3 +24,5 @@ entry:
store volatile i32 2, ptr addrspace(5) %ptr
ret void
}
+
+; ALL: .set scratch_ps.private_seg_size, 132
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 27b71dd471a839..aa16937d7d897d 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel
}
; CHECK: ; LDSByteSize: 4 bytes
+define void @nonkernel() {
+; GFX9-LABEL: nonkernel:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v1, v0
+; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT: ds_write_b64 v0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: nonkernel:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; G_GFX9-LABEL: nonkernel:
+; G_GFX9: ; %bb.0:
+; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
+; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
+; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX9-NEXT: ds_write_b32 v3, v2
+; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; G_GFX10-LABEL: nonkernel:
+; G_GFX10: ; %bb.0:
+; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
+; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
+; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT: ds_write_b32 v3, v2
+; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: s_setpc_b64 s[30:31]
+ store i32 0, ptr addrspace(3) @used_by_both
+ store double 0.0, ptr addrspace(3) @used_by_function
+ ret void
+}
+
; Needs to allocate both variables, store to used_by_both is at sizeof(double)
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
@@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() {
}
; CHECK: ; LDSByteSize: 4 bytes
-
-define void @nonkernel() {
-; GFX9-LABEL: nonkernel:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: ds_write_b32 v0, v0 offset:8
-; GFX9-NEXT: ds_write_b64 v0, v[0:1]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: nonkernel:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: ds_write_b32 v0, v0 offset:8
-; GFX10-NEXT: ds_write_b64 v0, v[0:1]
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; G_GFX9-LABEL: nonkernel:
-; G_GFX9: ; %bb.0:
-; G_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX9-NEXT: v_mov_b32_e32 v2, 0
-; G_GFX9-NEXT: v_mov_b32_e32 v3, 8
-; G_GFX9-NEXT: v_mov_b32_e32 v0, 0
-; G_GFX9-NEXT: v_mov_b32_e32 v1, 0
-; G_GFX9-NEXT: ds_write_b32 v3, v2
-; G_GFX9-NEXT: ds_write_b64 v2, v[0:1]
-; G_GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: nonkernel:
-; G_GFX10: ; %bb.0:
-; G_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT: v_mov_b32_e32 v2, 0
-; G_GFX10-NEXT: v_mov_b32_e32 v3, 8
-; G_GFX10-NEXT: v_mov_b32_e32 v0, 0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT: ds_write_b32 v3, v2
-; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_setpc_b64 s[30:31]
- store i32 0, ptr addrspace(3) @used_by_both
- store double 0.0, ptr addrspace(3) @used_by_function
- ret void
-}
-
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index b84686139d0e2c..3c100cf7a38527 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -110,13 +110,14 @@ bb.2:
store volatile i32 0, ptr addrspace(1) undef
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 4112
; DEFAULTSIZE: ; ScratchSize: 4112
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((41|((kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 16
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack, 1
; DEFAULTSIZE-V5: ; ScratchSize: 16
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 1040
; ASSUME1024: ; ScratchSize: 1040
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -205,13 +206,16 @@ bb.1:
ret void
}
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 4160
; DEFAULTSIZE: ; ScratchSize: 4160
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((59|((kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 64
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack, 1
; DEFAULTSIZE-V5: ; ScratchSize: 64
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 1088
; ASSUME1024: ; ScratchSize: 1088
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index d58477c194ea62..c0d228e1254e64 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -3,7 +3,11 @@
; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
; CHECK-LABEL: {{^}}recursive:
+; CHECK: .set recursive.private_seg_size, 16+(max(16384))
; CHECK: ScratchSize: 16
+
+; V5-LABEL: {{^}}recursive:
+; V5: .set recursive.has_recursion, 1
define void @recursive() {
call void @recursive()
store volatile i32 0, ptr addrspace(1) undef
@@ -11,18 +15,22 @@ define void @recursive() {
}
; CHECK-LABEL: {{^}}tail_recursive:
+; CHECK: .set tail_recursive.private_seg_size, 0
; CHECK: ScratchSize: 0
define void @tail_recursive() {
tail call void @tail_recursive()
ret void
}
+; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
define void @calls_tail_recursive() norecurse {
tail call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}tail_recursive_with_stack:
+; CHECK: .set tail_recursive_with_stack.private_seg_size, 8
+; CHECK: .set tail_recursive_with_stack.has_recursion, 1
define void @tail_recursive_with_stack() {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
@@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() {
; For an arbitrary recursive call, report a large number for unknown stack
; usage for code object v4 and older
; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
;
; V5-LABEL: {{^}}calls_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
+; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
define amdgpu_kernel void @calls_recursive() {
call void @recursive()
ret void
@@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
; Make sure we do not report a huge stack size for tail recursive
; functions
; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
+; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
call void @calls_tail_recursive()
ret void
@@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
; in the kernel.
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
define amdgpu_kernel void @kernel_calls_tail_recursive() {
call void @tail_recursive()
ret void
}
; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
;
; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .amdhsa_private_segment_fixed_size 8{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
+; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
call void @tail_recursive_with_stack()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 002de8bb4eb510..87ec51fb44ac45 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,12 +2,12 @@
; RUN: FileCheck -check-prefix=REMARK %s < %t
; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: 28
-; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: 9
-; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: 43
-; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:27:0: SGPRs: test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:27:0: VGPRs: test_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:27:0: AGPRs: test_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:27:0: ScratchSize [bytes/lane]: test_kernel.private_seg_size
; STDERR-NEXT: remark: foo.cl:27:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: 5
+; STDERR-NEXT: remark: foo.cl:27:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:27:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:27:0: LDS Size [bytes/block]: 512
@@ -19,7 +19,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: 'Function Name: '
-; REMARK-NEXT: - FunctionName: test_kernel
+; REMARK-NEXT: - FunctionName: test_kernel
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -28,7 +28,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' SGPRs: '
-; REMARK-NEXT: - NumSGPR: '28'
+; REMARK-NEXT: - NumSGPR: 'test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -37,7 +37,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' VGPRs: '
-; REMARK-NEXT: - NumVGPR: '9'
+; REMARK-NEXT: - NumVGPR: test_kernel.num_vgpr
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -46,7 +46,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' AGPRs: '
-; REMARK-NEXT: - NumAGPR: '43'
+; REMARK-NEXT: - NumAGPR: test_kernel.num_agpr
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -55,17 +55,17 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' ScratchSize [bytes/lane]: '
-; REMARK-NEXT: - ScratchSize: '0'
-; REMARK-NEXT: ..
+; REMARK-NEXT: - ScratchSize: test_kernel.private_seg_size
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: DynamicStack
; REMARK-NEXT: DebugLoc: { File: foo.cl, Line: 27, Column: 0 }
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
-; REMARK-NEXT: - String: ' Dynamic Stack:
-; REMARK-NEXT: - DynamicStack: 'False'
-; REMARK-NEXT: ..
+; REMARK-NEXT: - String: ' Dynamic Stack: '
+; REMARK-NEXT: - DynamicStack: 'False'
+; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
; REMARK-NEXT: Name: Occupancy
@@ -73,7 +73,7 @@
; REMARK-NEXT: Function: test_kernel
; REMARK-NEXT: Args:
; REMARK-NEXT: - String: ' Occupancy [waves/SIMD]: '
-; REMARK-NEXT: - Occupancy: '5'
+; REMARK-NEXT: - Occupancy: 'occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))'
; REMARK-NEXT: ...
; REMARK-NEXT: --- !Analysis
; REMARK-NEXT: Pass: kernel-resource-usage
@@ -122,12 +122,12 @@ define void @test_func() !dbg !6 {
}
; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: 4
-; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:8:0: SGPRs: empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:8:0: VGPRs: empty_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:8:0: AGPRs: empty_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:8:0: ScratchSize [bytes/lane]: empty_kernel.private_seg_size
; STDERR-NEXT: remark: foo.cl:8:0: Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:8:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(empty_kernel.num_agpr, empty_kernel.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:8:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:8:0: LDS Size [bytes/block]: 0
@@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
}
; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:64:0: SGPRs: test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:64:0: VGPRs: test_indirect_call.num_vgpr
+; STDERR-NEXT: remark: foo.cl:64:0: AGPRs: test_indirect_call.num_agpr
+; STDERR-NEXT: remark: foo.cl:64:0: ScratchSize [bytes/lane]: test_indirect_call.private_seg_size
+; STDERR-NEXT: remark: foo.cl:64:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:64:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:64:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:64:0: LDS Size [bytes/block]: 0
@@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
}
; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: 144
-; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:74:0: SGPRs: test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:74:0: VGPRs: test_indirect_w_static_stack.num_vgpr
+; STDERR-NEXT: remark: foo.cl:74:0: AGPRs: test_indirect_w_static_stack.num_agpr
+; STDERR-NEXT: remark: foo.cl:74:0: ScratchSize [bytes/lane]: test_indirect_w_static_stack.private_seg_size
+; STDERR-NEXT: remark: foo.cl:74:0: Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:74:0: Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
; STDERR-NEXT: remark: foo.cl:74:0: SGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: VGPRs Spill: 0
; STDERR-NEXT: remark: foo.cl:74:0: LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index bba59ba4d80302..5d5aad76afd095 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -1,6 +1,6 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
; Make sure there's no assertion when trying to report the resource
; usage for a function which becomes dead during codegen.
@@ -21,9 +21,10 @@ define internal fastcc void @unreachable() {
; GCN-NOT: s_swappc_b64
; GCN: s_endpgm
-; GCN: .amdhsa_private_segment_fixed_size 0
-; GCN-NOT: .amdhsa_uses_dynamic_stack 0
-; GCN-V5: .amdhsa_uses_dynamic_stack 0
+; GCN-NOT: .amdhsa_uses_dynamic_stack
+; GCN-V5: .amdhsa_uses_dynamic_stack
+; ALL: .set entry.private_seg_size, 0
+; ALL: .set entry.has_dyn_sized_stack, 0
define amdgpu_kernel void @entry() {
bb0:
br i1 false, label %bb1, label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 6ddf0986755f95..38d202eb4308f6 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel max_alignment_128
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 256
+; VI-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -29,16 +29,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -54,6 +54,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set max_alignment_128.num_vgpr, 1
+; VI-NEXT: .set max_alignment_128.num_agpr, 0
+; VI-NEXT: .set max_alignment_128.num_sgpr, 18
+; VI-NEXT: .set max_alignment_128.private_seg_size, 256
+; VI-NEXT: .set max_alignment_128.uses_vcc, 0
+; VI-NEXT: .set max_alignment_128.uses_flat_scratch, 0
+; VI-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
+; VI-NEXT: .set max_alignment_128.has_recursion, 0
+; VI-NEXT: .set max_alignment_128.has_indirect_call, 0
;
; GFX9-LABEL: max_alignment_128:
; GFX9: ; %bb.0:
@@ -70,7 +79,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel max_alignment_128
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 256
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -80,16 +89,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -107,6 +116,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set max_alignment_128.num_vgpr, 1
+; GFX9-NEXT: .set max_alignment_128.num_agpr, 0
+; GFX9-NEXT: .set max_alignment_128.num_sgpr, 18
+; GFX9-NEXT: .set max_alignment_128.private_seg_size, 256
+; GFX9-NEXT: .set max_alignment_128.uses_vcc, 0
+; GFX9-NEXT: .set max_alignment_128.uses_flat_scratch, 0
+; GFX9-NEXT: .set max_alignment_128.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set max_alignment_128.has_recursion, 0
+; GFX9-NEXT: .set max_alignment_128.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 128, addrspace(5)
@@ -130,7 +148,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel stackrealign_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 12
+; VI-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -140,16 +158,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -165,6 +183,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set stackrealign_attr.num_vgpr, 1
+; VI-NEXT: .set stackrealign_attr.num_agpr, 0
+; VI-NEXT: .set stackrealign_attr.num_sgpr, 18
+; VI-NEXT: .set stackrealign_attr.private_seg_size, 12
+; VI-NEXT: .set stackrealign_attr.uses_vcc, 0
+; VI-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
+; VI-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
+; VI-NEXT: .set stackrealign_attr.has_recursion, 0
+; VI-NEXT: .set stackrealign_attr.has_indirect_call, 0
;
; GFX9-LABEL: stackrealign_attr:
; GFX9: ; %bb.0:
@@ -181,7 +208,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel stackrealign_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 12
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -191,16 +218,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -218,6 +245,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set stackrealign_attr.num_vgpr, 1
+; GFX9-NEXT: .set stackrealign_attr.num_agpr, 0
+; GFX9-NEXT: .set stackrealign_attr.num_sgpr, 18
+; GFX9-NEXT: .set stackrealign_attr.private_seg_size, 12
+; GFX9-NEXT: .set stackrealign_attr.uses_vcc, 0
+; GFX9-NEXT: .set stackrealign_attr.uses_flat_scratch, 0
+; GFX9-NEXT: .set stackrealign_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set stackrealign_attr.has_recursion, 0
+; GFX9-NEXT: .set stackrealign_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
@@ -241,7 +277,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .p2align 6
; VI-NEXT: .amdhsa_kernel alignstack_attr
; VI-NEXT: .amdhsa_group_segment_fixed_size 0
-; VI-NEXT: .amdhsa_private_segment_fixed_size 128
+; VI-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
; VI-NEXT: .amdhsa_kernarg_size 56
; VI-NEXT: .amdhsa_user_sgpr_count 14
; VI-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -251,16 +287,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; VI-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; VI-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; VI-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; VI-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT: .amdhsa_next_free_vgpr 1
-; VI-NEXT: .amdhsa_next_free_sgpr 18
-; VI-NEXT: .amdhsa_reserve_vcc 0
-; VI-NEXT: .amdhsa_reserve_flat_scratch 0
+; VI-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; VI-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0))
+; VI-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; VI-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
; VI-NEXT: .amdhsa_float_round_mode_32 0
; VI-NEXT: .amdhsa_float_round_mode_16_64 0
; VI-NEXT: .amdhsa_float_denorm_mode_32 3
@@ -276,6 +312,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; VI-NEXT: .amdhsa_exception_int_div_zero 0
; VI-NEXT: .end_amdhsa_kernel
; VI-NEXT: .text
+; VI: .set alignstack_attr.num_vgpr, 1
+; VI-NEXT: .set alignstack_attr.num_agpr, 0
+; VI-NEXT: .set alignstack_attr.num_sgpr, 18
+; VI-NEXT: .set alignstack_attr.private_seg_size, 128
+; VI-NEXT: .set alignstack_attr.uses_vcc, 0
+; VI-NEXT: .set alignstack_attr.uses_flat_scratch, 0
+; VI-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
+; VI-NEXT: .set alignstack_attr.has_recursion, 0
+; VI-NEXT: .set alignstack_attr.has_indirect_call, 0
;
; GFX9-LABEL: alignstack_attr:
; GFX9: ; %bb.0:
@@ -292,7 +337,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .p2align 6
; GFX9-NEXT: .amdhsa_kernel alignstack_attr
; GFX9-NEXT: .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT: .amdhsa_private_segment_fixed_size 128
+; GFX9-NEXT: .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
; GFX9-NEXT: .amdhsa_kernarg_size 56
; GFX9-NEXT: .amdhsa_user_sgpr_count 14
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
@@ -302,16 +347,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_user_sgpr_dispatch_id 1
; GFX9-NEXT: .amdhsa_user_sgpr_flat_scratch_init 1
; GFX9-NEXT: .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
; GFX9-NEXT: .amdhsa_system_sgpr_workgroup_info 0
; GFX9-NEXT: .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT: .amdhsa_next_free_vgpr 1
-; GFX9-NEXT: .amdhsa_next_free_sgpr 18
-; GFX9-NEXT: .amdhsa_reserve_vcc 0
-; GFX9-NEXT: .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT: .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; GFX9-NEXT: .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1))
+; GFX9-NEXT: .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; GFX9-NEXT: .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
; GFX9-NEXT: .amdhsa_reserve_xnack_mask 1
; GFX9-NEXT: .amdhsa_float_round_mode_32 0
; GFX9-NEXT: .amdhsa_float_round_mode_16_64 0
@@ -329,6 +374,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
; GFX9-NEXT: .amdhsa_exception_int_div_zero 0
; GFX9-NEXT: .end_amdhsa_kernel
; GFX9-NEXT: .text
+; GFX9: .set alignstack_attr.num_vgpr, 1
+; GFX9-NEXT: .set alignstack_attr.num_agpr, 0
+; GFX9-NEXT: .set alignstack_attr.num_sgpr, 18
+; GFX9-NEXT: .set alignstack_attr.private_seg_size, 128
+; GFX9-NEXT: .set alignstack_attr.uses_vcc, 0
+; GFX9-NEXT: .set alignstack_attr.uses_flat_scratch, 0
+; GFX9-NEXT: .set alignstack_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT: .set alignstack_attr.has_recursion, 0
+; GFX9-NEXT: .set alignstack_attr.has_indirect_call, 0
%clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
store volatile i8 3, ptr addrspace(5) %clutter
%alloca.align = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0d..432d8e0e856dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c9959..b6b30bc591e2b9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0))
; ASM: .amdhsa_reserve_xnack_mask 0
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a9..0aa5f2a0919761 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,8 +6,9 @@
define amdgpu_kernel void @kern() #0 {
; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 9bab3e6fcf8c45..c2845bf1035640 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -5,23 +5,32 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t1 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t2 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t1
+; RUN: FileCheck -check-prefix=GCN %s < %t2
; enable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t3 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t4 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t3
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t4
; disable trap handler feature
; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t5 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t6 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t5
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t6
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs %s -o %t7 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs %s -o %t8 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t7
+; RUN: FileCheck -check-prefix=GCN %s < %t8
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
@@ -31,11 +40,11 @@ declare void @llvm.debugtrap() #1
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 5080
+; MESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 5016
+; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5016
; GCN-LABEL: {{^}}hsa_trap:
; HSA-TRAP: s_mov_b64 s[0:1], s[6:7]
@@ -59,11 +68,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) {
; MESA-TRAP: .section .AMDGPU.config
; MESA-TRAP: .long 47180
-; MESA-TRAP-NEXT: .long 5080
+; MESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5080
; NOMESA-TRAP: .section .AMDGPU.config
; NOMESA-TRAP: .long 47180
-; NOMESA-TRAP-NEXT: .long 5016
+; NOMESA-TRAP-NEXT: .long ((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5016
; GCN-LABEL: {{^}}hsa_debugtrap:
; HSA-TRAP: s_trap 3
>From 5dff9e2352a3c00164c6fe6e9d9fdc27a0c5fb15 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 12 Aug 2024 15:56:46 +0100
Subject: [PATCH 2/3] Formatting
---
llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e41f302c3d56ce..6a91ad06de5d12 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -65,11 +65,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
- const MCSymbol *HasIndirectCall){};
+ const MCSymbol *HasIndirectCall) {};
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
const MCSymbol *MaxAGPR,
- const MCSymbol *MaxSGPR){};
+ const MCSymbol *MaxSGPR) {};
/// \returns True on success, false on failure.
virtual bool EmitISAVersion() { return true; }
>From 82ca3282e05bffaf1b56c0d17727d246f8bb496c Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Fri, 16 Aug 2024 14:23:43 +0100
Subject: [PATCH 3/3] Feedback
---
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 49 ++++++++++---------
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 7 ++-
.../Target/AMDGPU/AMDGPUMCResourceInfo.cpp | 10 ++--
llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h | 11 ++---
4 files changed, 40 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 23bc804515e690..524be4cbc4a2ab 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -94,8 +94,6 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
RI = std::make_unique<MCResourceInfo>(OutContext);
- OccupancyValidateMap =
- std::make_unique<DenseMap<const Function *, const MCExpr *>>();
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -363,7 +361,7 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
return AsmPrinter::doInitialization(M);
}
-void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
+void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
return;
@@ -438,8 +436,8 @@ void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
}
}
- auto I = OccupancyValidateMap->find(&F);
- if (I != OccupancyValidateMap->end()) {
+ auto I = OccupancyValidateMap.find(&F);
+ if (I != OccupancyValidateMap.end()) {
const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
F, "amdgpu-waves-per-eu", {0, 0}, true);
uint64_t Occupancy;
@@ -473,13 +471,13 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Assign expressions which can only be resolved when all other functions are
// known.
- RI->Finalize();
+ RI->finalize();
getTargetStreamer()->EmitMCResourceMaximums(
RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
- for (Function &F : M.functions()) {
- ValidateMCResourceInfo(F);
- }
+ for (Function &F : M.functions())
+ validateMCResourceInfo(F);
+
return AsmPrinter::doFinalization(M);
}
@@ -867,6 +865,24 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
return CodeSize;
}
+// AccumOffset computed for the MCExpr equivalent of:
+// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
+static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
+ const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+ const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+ // Can't be lower than 1 for subsequent alignTo.
+ const MCExpr *MaximumTaken =
+ AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
+
+ // Practically, it's computing divideCeil(MaximumTaken, 4).
+ const MCExpr *DivCeil = MCBinaryExpr::createDiv(
+ AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
+ Ctx);
+
+ return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
+}
+
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
@@ -891,24 +907,13 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
return MCSymbolRefExpr::create(Sym, Ctx);
};
- const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
- const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
-
using RIK = MCResourceInfo::ResourceInfoKind;
ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
- // AccumOffset computed for the MCExpr equivalent of:
- // alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
- ProgInfo.AccumOffset = MCBinaryExpr::createSub(
- MCBinaryExpr::createDiv(
- AMDGPUMCExpr::createAlignTo(
- AMDGPUMCExpr::createMax({ConstOne, ProgInfo.NumArchVGPR}, Ctx),
- ConstFour, Ctx),
- ConstFour, Ctx),
- ConstOne, Ctx);
+ ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
ProgInfo.TgSplit = STM.isTgSplitEnabled();
ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
@@ -1216,7 +1221,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
- OccupancyValidateMap->insert({&MF.getFunction(), ProgInfo.Occupancy});
+ OccupancyValidateMap.insert({&MF.getFunction(), ProgInfo.Occupancy});
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 676a4687ee2af7..a49ef406268d76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -49,11 +49,10 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
MCCodeEmitter *DumpCodeInstEmitter = nullptr;
- // ValidateMCResourceInfo cannot recompute parts of the occupancy as it does
+ // validateMCResourceInfo cannot recompute parts of the occupancy as it does
// for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
// really want to track and validate the occupancy.
- std::unique_ptr<DenseMap<const Function *, const MCExpr *>>
- OccupancyValidateMap;
+ DenseMap<const Function *, const MCExpr *> OccupancyValidateMap;
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
@@ -91,7 +90,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
/// Attempts to replace the validation that is missed in getSIProgramInfo due
/// to MCExpr being unknown. Invoked during doFinalization such that the
/// MCResourceInfo symbols are known.
- void ValidateMCResourceInfo(Function &F);
+ void validateMCResourceInfo(Function &F);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index 58383475b312c9..a704a5fdc1cf19 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -68,7 +68,7 @@ void MCResourceInfo::assignMaxRegs() {
assignMaxRegSym(MaxSGPRSym, MaxSGPR);
}
-void MCResourceInfo::Finalize() {
+void MCResourceInfo::finalize() {
assert(!finalized && "Cannot finalize ResourceInfo again.");
finalized = true;
assignMaxRegs();
@@ -93,8 +93,8 @@ void MCResourceInfo::assignResourceInfoExpr(
const MCConstantExpr *localConstExpr =
MCConstantExpr::create(localValue, OutContext);
const MCExpr *SymVal = localConstExpr;
- if (Callees.size() > 0) {
- std::vector<const MCExpr *> ArgExprs;
+ if (!Callees.empty()) {
+ SmallVector<const MCExpr *, 8> ArgExprs;
// Avoid recursive symbol assignment.
SmallSet<StringRef, 8> Seen;
ArgExprs.push_back(localConstExpr);
@@ -148,7 +148,7 @@ void MCResourceInfo::gatherResourceInfo(
{
// The expression for private segment size should be: FRI.PrivateSegmentSize
// + max(FRI.Callees, FRI.CalleeSegmentSize)
- std::vector<const MCExpr *> ArgExprs;
+ SmallVector<const MCExpr *, 8> ArgExprs;
if (FRI.CalleeSegmentSize)
ArgExprs.push_back(
MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
@@ -162,7 +162,7 @@ void MCResourceInfo::gatherResourceInfo(
}
const MCExpr *localConstExpr =
MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
- if (ArgExprs.size() > 0) {
+ if (!ArgExprs.empty()) {
const AMDGPUMCExpr *transitiveExpr =
AMDGPUMCExpr::createMax(ArgExprs, OutContext);
localConstExpr =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index 6646003693a67f..97c1843e3b80f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -37,9 +37,9 @@ class MCResourceInfo {
};
private:
- int32_t MaxVGPR;
- int32_t MaxAGPR;
- int32_t MaxSGPR;
+ int32_t MaxVGPR = 0;
+ int32_t MaxAGPR = 0;
+ int32_t MaxSGPR = 0;
MCContext &OutContext;
bool finalized;
@@ -54,8 +54,7 @@ class MCResourceInfo {
public:
MCResourceInfo(MCContext &OutContext)
- : MaxVGPR(0), MaxAGPR(0), MaxSGPR(0), OutContext(OutContext),
- finalized(false) {}
+ : OutContext(OutContext), finalized(false) {}
void addMaxVGPRCandidate(int32_t candidate) {
MaxVGPR = std::max(MaxVGPR, candidate);
}
@@ -72,7 +71,7 @@ class MCResourceInfo {
// Resolves the final symbols that require the inter-function resource info
// to be resolved.
- void Finalize();
+ void finalize();
MCSymbol *getMaxVGPRSymbol();
MCSymbol *getMaxAGPRSymbol();
More information about the cfe-commits
mailing list