[llvm] [AMDGPU] Replace dynamic VGPR feature with attribute (PR #133444)
Diana Picus via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 28 07:01:19 PDT 2025
https://github.com/rovka created https://github.com/llvm/llvm-project/pull/133444
Use a function attribute (amdgpu-dynamic-vgpr) instead of a subtarget feature, as requested in #130030.
>From ab3edd06b0e28fc61ecbea326fdf3d8764a8e73a Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 28 Mar 2025 14:39:38 +0100
Subject: [PATCH] [AMDGPU] Replace dynamic VGPR feature with attribute
Use a function attribute (amdgpu-dynamic-vgpr) instead of a subtarget
feature, as requested in #130030.
---
llvm/docs/AMDGPUUsage.rst | 10 +-
llvm/lib/Target/AMDGPU/AMDGPU.td | 6 -
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 18 +-
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 6 +-
.../Target/AMDGPU/GCNIterativeScheduler.cpp | 27 +-
llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2 +-
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 14 +-
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 12 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 38 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 20 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 21 +-
.../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp | 3 +-
.../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h | 1 +
.../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 6 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 8 +-
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 4 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 8 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 34 +-
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 12 +-
.../dynamic-vgpr-reserve-stack-for-cwsr.ll | 10 +-
.../AMDGPU/machine-function-info-cwsr.ll | 4 +-
.../AMDGPU/pal-metadata-3.0-callable-dvgpr.ll | 305 ++++++++++++++++
.../AMDGPU/pal-metadata-3.0-callable.ll | 2 -
.../CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll | 205 +++++++++++
llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll | 13 +-
.../AMDGPU/release-vgprs-gfx12-dvgpr.mir | 340 ++++++++++++++++++
.../CodeGen/AMDGPU/release-vgprs-gfx12.mir | 57 +--
.../Target/AMDGPU/AMDGPUUnitTests.cpp | 38 +-
30 files changed, 1038 insertions(+), 190 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ab507e3714ebb..daedc324c5d0a 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -759,11 +759,6 @@ For example:
performant than code generated for XNACK replay
disabled.
- dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
- Waves launched in this mode may allocate or deallocate the VGPRs
- using dedicated instructions, but may not send the DEALLOC_VGPRS
- message.
-
=============== ============================ ==================================================
.. _amdgpu-target-id:
@@ -1743,6 +1738,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.
+ "amdgpu-dynamic-vgpr" Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
+ Waves launched in this mode may allocate or deallocate the VGPRs
+ using dedicated instructions, but may not send the DEALLOC_VGPRS
+ message.
+
================================================ ==========================================================
Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 84619dd656f35..8bd9fcd8e04e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1257,12 +1257,6 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;
-def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
- "DynamicVGPR",
- "true",
- "Enable dynamic VGPR mode"
->;
-
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
"DynamicVGPRBlockSize32",
"true",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 800e2b9c0e657..8e3f5aef8bd05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -452,13 +452,13 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max(
- {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
+ {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves, MFI.isDynamicVGPREnabled())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
- MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
+ MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), MFI.isDynamicVGPREnabled(), STM,
OutContext);
uint64_t Occupancy;
@@ -1081,7 +1081,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx);
ProgInfo.NumVGPRsForWavesPerEU =
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
- CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
+ CreateExpr(STM.getMinNumVGPRs(MaxWaves, MFI->isDynamicVGPREnabled()))},
Ctx);
if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
@@ -1255,7 +1255,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
- ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+ ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, MFI->isDynamicVGPREnabled(), STM, Ctx);
const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
@@ -1404,7 +1404,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
const SIProgramInfo &CurrentProgramInfo,
- CallingConv::ID CC, const GCNSubtarget &ST) {
+ CallingConv::ID CC, const GCNSubtarget &ST, bool IsDynamicVGPR) {
if (ST.hasIEEEMode())
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
@@ -1416,7 +1416,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
(bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
- if (ST.isDynamicVGPREnabled())
+ if (IsDynamicVGPR)
MD->setComputeRegisters(".dynamic_vgpr_en", true);
}
@@ -1443,7 +1443,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// For targets that support dynamic VGPRs, set the number of saved dynamic
// VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- if (STM.isDynamicVGPREnabled() &&
+ if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4);
@@ -1469,7 +1469,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable);
- EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
+ EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM, MFI->isDynamicVGPREnabled());
}
// ScratchSize is in bytes, 16 aligned.
@@ -1540,7 +1540,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
- EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
+ EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST, MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled());
}
// Set optional info
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 98a70c0dbb912..8f9a7aefb104e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -199,8 +199,12 @@ unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
if (!TM.getTargetTriple().isAMDGCN())
return 128;
+ bool IsDynamicVGPR = false;
+ if (F.hasFnAttribute("amdgpu-dynamic-vgpr"))
+ IsDynamicVGPR = F.getFnAttribute("amdgpu-dynamic-vgpr").getValueAsBool();
+
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first, IsDynamicVGPR);
// A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index da065e8d8cb6b..ded5c270e3cd9 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -426,13 +426,14 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>();
- const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+ const bool IsDynamicVGPR = MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled();
+ const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST, IsDynamicVGPR);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');
auto NewOcc = TargetOcc;
for (auto *R : Regions) {
- if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+ if (R->MaxPressure.getOccupancy(ST, IsDynamicVGPR) >= NewOcc)
break;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@@ -444,7 +445,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));
- NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+ NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, IsDynamicVGPR));
if (NewOcc <= Occ)
break;
@@ -465,9 +466,10 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
+ bool IsDynamicVGPR = MFI->isDynamicVGPREnabled();
sortRegionsByPressure(TgtOcc);
- auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST, IsDynamicVGPR);
if (TryMaximizeOccupancy && Occ < TgtOcc)
Occ = tryMaximizeOccupancy(TgtOcc);
@@ -494,19 +496,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
- if (RP.getOccupancy(ST) < TgtOcc) {
+ if (RP.getOccupancy(ST, IsDynamicVGPR) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
- R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ R->BestSchedule->MaxPressure.getOccupancy(ST, IsDynamicVGPR) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
- assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+ assert(R->MaxPressure.getOccupancy(ST, IsDynamicVGPR) >= TgtOcc);
}
}
- FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST, IsDynamicVGPR));
}
}
MFI->limitOccupancy(FinalOccupancy);
@@ -552,9 +554,10 @@ void GCNIterativeScheduler::scheduleILP(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
+ bool IsDynamicVGPR = MFI->isDynamicVGPREnabled();
sortRegionsByPressure(TgtOcc);
- auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(ST, IsDynamicVGPR);
if (TryMaximizeOccupancy && Occ < TgtOcc)
Occ = tryMaximizeOccupancy(TgtOcc);
@@ -572,17 +575,17 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
- if (RP.getOccupancy(ST) < TgtOcc) {
+ if (RP.getOccupancy(ST, IsDynamicVGPR) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
- R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+ R->BestSchedule->MaxPressure.getOccupancy(ST, IsDynamicVGPR) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
- FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST, IsDynamicVGPR));
}
}
MFI->limitOccupancy(FinalOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 13eb0ca539a4c..f3fd6c8564693 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -251,7 +251,7 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
- MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
+ MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->isDynamicVGPREnabled()), MaxNumVGPRs);
CSRegs = MRI->getCalleeSavedRegs();
using Candidate = std::pair<const MachineInstr*, bool>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index f74d12cfab0c0..3088dd2f65a8c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -13,6 +13,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -92,17 +93,18 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ bool IsDynamicVGPR = MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled();
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc =
std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), IsDynamicVGPR));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc =
std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
+ ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()), IsDynamicVGPR));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -224,13 +226,13 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
O.getVGPRNum(ST.hasGFX90AInsts()));
}
-Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
- return Printable([&RP, ST](raw_ostream &OS) {
+Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST, bool IsDynamicVGPR) {
+ return Printable([&RP, ST, IsDynamicVGPR](raw_ostream &OS) {
OS << "VGPRs: " << RP.Value[GCNRegPressure::VGPR32] << ' '
<< "AGPRs: " << RP.getAGPRNum();
if (ST)
OS << "(O"
- << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
+ << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()), IsDynamicVGPR)
<< ')';
OS << ", SGPRs: " << RP.getSGPRNum();
if (ST)
@@ -238,7 +240,7 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight();
if (ST)
- OS << " -> Occ: " << RP.getOccupancy(*ST);
+ OS << " -> Occ: " << RP.getOccupancy(*ST, IsDynamicVGPR);
OS << '\n';
});
}
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 7554b9f578fcb..15e86c733ff60 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -67,9 +67,9 @@ struct GCNRegPressure {
Value[AGPR_TUPLE]); }
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
- unsigned getOccupancy(const GCNSubtarget &ST) const {
+ unsigned getOccupancy(const GCNSubtarget &ST, bool IsDynamicVGPR) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
- ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
+ ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()), IsDynamicVGPR));
}
void inc(unsigned Reg,
@@ -77,8 +77,8 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
- bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
- return getOccupancy(ST) > O.getOccupancy(ST);
+ bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O, bool IsDynamicVGPR) const {
+ return getOccupancy(ST, IsDynamicVGPR) > O.getOccupancy(ST, IsDynamicVGPR);
}
/// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@@ -126,7 +126,7 @@ struct GCNRegPressure {
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);
- friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
+ friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST, bool IsDynamicVGPR);
};
inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
@@ -395,7 +395,7 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2);
-Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
+Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr, bool IsDynamicVGPR = false);
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ea9bc88bbe86b..f2900beb5234c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -95,16 +95,16 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
if (!KnownExcessRP) {
VGPRCriticalLimit =
- std::min(ST.getMaxNumVGPRs(TargetOccupancy), VGPRExcessLimit);
+ std::min(ST.getMaxNumVGPRs(TargetOccupancy, MFI.isDynamicVGPREnabled()), VGPRExcessLimit);
} else {
// This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
// returns a reasonably small number for targets with lots of VGPRs, such
// as GFX10 and GFX11.
LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
"VGPRCriticalLimit calculation method.\n");
-
- unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST);
- unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
+ bool IsDynamicVGPR = MFI.isDynamicVGPREnabled();
+ unsigned Granule = AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, IsDynamicVGPR);
+ unsigned Addressable = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, IsDynamicVGPR);
unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
VGPRBudget = std::max(VGPRBudget, Granule);
VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
@@ -1126,7 +1126,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] =
- DAG.Pressure[IDX].getOccupancy(DAG.ST) == DAG.MinOccupancy;
+ DAG.Pressure[IDX].getOccupancy(DAG.ST, DAG.MFI.isDynamicVGPREnabled()) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
@@ -1266,11 +1266,13 @@ void GCNSchedStage::checkScheduling() {
LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
+ bool IsDynamicVGPR = DAG.MFI.isDynamicVGPREnabled();
+
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+ PressureAfter.getOccupancy(ST, IsDynamicVGPR) == DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -1280,9 +1282,9 @@ void GCNSchedStage::checkScheduling() {
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
unsigned WavesAfter =
- std::min(TargetOccupancy, PressureAfter.getOccupancy(ST));
+ std::min(TargetOccupancy, PressureAfter.getOccupancy(ST, IsDynamicVGPR));
unsigned WavesBefore =
- std::min(TargetOccupancy, PressureBefore.getOccupancy(ST));
+ std::min(TargetOccupancy, PressureBefore.getOccupancy(ST, IsDynamicVGPR));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
@@ -1332,7 +1334,7 @@ void GCNSchedStage::checkScheduling() {
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST) == DAG.MinOccupancy;
+ PressureAfter.getOccupancy(ST, IsDynamicVGPR) == DAG.MinOccupancy;
}
}
@@ -1455,11 +1457,11 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
return true;
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
- if (ST.isDynamicVGPREnabled()) {
+ if (DAG.MFI.isDynamicVGPREnabled()) {
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
- &ST, PressureBefore.getVGPRNum(false));
+ &ST, DAG.MFI.isDynamicVGPREnabled(), PressureBefore.getVGPRNum(false));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
- &ST, PressureAfter.getVGPRNum(false));
+ &ST, DAG.MFI.isDynamicVGPREnabled(), PressureAfter.getVGPRNum(false));
if (BlocksAfter > BlocksBefore)
return true;
}
@@ -1483,7 +1485,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
- if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
+ if ((WavesAfter <= PressureBefore.getOccupancy(ST, DAG.MFI.isDynamicVGPREnabled()) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@@ -1507,7 +1509,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
unsigned WavesBefore =
- std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST));
+ std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(ST, DAG.MFI.isDynamicVGPREnabled()));
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@@ -1565,7 +1567,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
+ PressureBefore.getOccupancy(ST, DAG.MFI.isDynamicVGPREnabled()) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RescheduleRegions[RegionIdx] =
S.hasNextStage() &&
@@ -1823,7 +1825,7 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
// The occupancy of this region could have been improved by a previous
// iteration's sinking of defs.
- if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
+ if (NewPressure[I].getOccupancy(ST, DAG.MFI.isDynamicVGPREnabled()) > DAG.MinOccupancy) {
NewRescheduleRegions[I] = true;
Improved = true;
continue;
@@ -1869,7 +1871,7 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
#endif
}
int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
- unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
+ unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink, DAG.MFI.isDynamicVGPREnabled());
// If in the most optimistic scenario, we cannot improve occupancy, then do
// not attempt to sink any instructions.
if (OptimisticOccupancy <= DAG.MinOccupancy)
@@ -1920,7 +1922,7 @@ bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
}
SinkedDefs.push_back(Def);
- ImproveOccupancy = NewPressure[I].getOccupancy(ST);
+ ImproveOccupancy = NewPressure[I].getOccupancy(ST, DAG.MFI.isDynamicVGPREnabled());
if (ImproveOccupancy > DAG.MinOccupancy)
break;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 53f5c1efd14eb..309fd97520d88 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -366,8 +366,8 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
getGeneration());
}
-unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
- return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs, bool IsDynamicVGPR) const {
+ return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs, IsDynamicVGPR);
}
unsigned
@@ -403,9 +403,13 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
std::pair<unsigned, unsigned>
GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs, unsigned NumVGPRs) const {
+ bool IsDynamicVGPR = false;
+ if (F.hasFnAttribute("amdgpu-dynamic-vgpr"))
+ IsDynamicVGPR = F.getFnAttribute("amdgpu-dynamic-vgpr").getValueAsBool();
+
auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
- unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs);
+ unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, IsDynamicVGPR);
// Maximum occupancy may be further limited by high SGPR/VGPR usage.
MaxOcc = std::min(MaxOcc, std::min(SGPROcc, VGPROcc));
@@ -498,9 +502,13 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
unsigned GCNSubtarget::getBaseMaxNumVGPRs(
const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
+ bool IsDynamicVGPR = false;
+ if (F.hasFnAttribute("amdgpu-dynamic-vgpr"))
+ IsDynamicVGPR = F.getFnAttribute("amdgpu-dynamic-vgpr").getValueAsBool();
+
// Compute maximum number of VGPRs function can use using default/requested
// minimum number of waves per execution unit.
- unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+ unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first, IsDynamicVGPR);
// Check if maximum number of VGPRs was explicitly requested using
// "amdgpu-num-vgpr" attribute.
@@ -512,10 +520,10 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
// Make sure requested value is compatible with values implied by
// default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first, IsDynamicVGPR))
Requested = 0;
if (WavesPerEU.second && Requested &&
- Requested < getMinNumVGPRs(WavesPerEU.second))
+ Requested < getMinNumVGPRs(WavesPerEU.second, IsDynamicVGPR))
Requested = 0;
if (Requested)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7384278d81cc1..0e15817c15683 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -191,7 +191,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// indicates a lack of S_CLAUSE support.
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
- bool DynamicVGPR = false;
bool DynamicVGPRBlockSize32 = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -1374,7 +1373,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// Return the maximum number of waves per SIMD for kernels using \p VGPRs
/// VGPRs
- unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, bool IsDynamicVGPR) const;
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
/// be achieved when the only function running on a CU is \p F, each workgroup
@@ -1524,8 +1523,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned getMaxNumSGPRs(const Function &F) const;
/// \returns VGPR allocation granularity supported by the subtarget.
- unsigned getVGPRAllocGranule() const {
- return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
+ unsigned getVGPRAllocGranule(bool IsDynamicVGPR) const {
+ return AMDGPU::IsaInfo::getVGPRAllocGranule(this, IsDynamicVGPR);
}
/// \returns VGPR encoding granularity supported by the subtarget.
@@ -1545,20 +1544,20 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
}
/// \returns Addressable number of VGPRs supported by the subtarget.
- unsigned getAddressableNumVGPRs() const {
- return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
+ unsigned getAddressableNumVGPRs(bool IsDynamicVGPR) const {
+ return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, IsDynamicVGPR);
}
/// \returns the minimum number of VGPRs that will prevent achieving more than
/// the specified number of waves \p WavesPerEU.
- unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
+ unsigned getMinNumVGPRs(unsigned WavesPerEU, bool IsDynamicVGPR) const {
+ return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU, IsDynamicVGPR);
}
/// \returns the maximum number of VGPRs that can be used and still achieved
/// at least the specified number of waves \p WavesPerEU.
- unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
- return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+ unsigned getMaxNumVGPRs(unsigned WavesPerEU, bool IsDynamicVGPR) const {
+ return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU, IsDynamicVGPR);
}
/// \returns max num VGPRs. This is the common utility function
@@ -1658,8 +1657,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return true;
}
- bool isDynamicVGPREnabled() const { return DynamicVGPR; }
-
bool requiresDisjointEarlyClobberAndUndef() const override {
// AMDGPU doesn't care if early-clobber and undef operands are allocated
// to the same register.
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 7fff2e515b046..5111e077fb823 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -285,10 +285,11 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
+ bool IsDynamicVGPR,
const GCNSubtarget &STM,
MCContext &Ctx) {
unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
- unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+ unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, IsDynamicVGPR);
unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
unsigned Generation = STM.getGeneration();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index c0167096f022a..da3f0387b36e6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -83,6 +83,7 @@ class AMDGPUMCExpr : public MCTargetExpr {
static const AMDGPUMCExpr *createOccupancy(unsigned InitOcc,
const MCExpr *NumSGPRs,
const MCExpr *NumVGPRs,
+ bool IsDynamicVGPR,
const GCNSubtarget &STM,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index bbc0280aed42e..19a899eec2565 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -198,7 +198,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// pointer becomes dead and could otherwise be reused for destination.
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
- unsigned Occupancy = MaxPressure.getOccupancy(*ST);
+ unsigned Occupancy = MaxPressure.getOccupancy(*ST, MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled());
// Don't push over half the register budget. We don't want to introduce
// spilling just to form a soft clause.
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 9c737b4f3e378..9cd1f56bf9b88 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -715,8 +715,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
Register FPReg = MFI->getFrameOffsetReg();
assert(FPReg != AMDGPU::FP_REG);
unsigned VGPRSize =
- llvm::alignTo((ST.getAddressableNumVGPRs() -
- AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
+ llvm::alignTo((ST.getAddressableNumVGPRs(MFI->isDynamicVGPREnabled()) -
+ AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, MFI->isDynamicVGPREnabled())) *
4,
FrameInfo.getMaxAlign());
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
@@ -1882,7 +1882,7 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
bool SIFrameLowering::mayReserveScratchForCWSR(
const MachineFunction &MF) const {
- return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
+ return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
AMDGPU::isCompute(MF.getFunction().getCallingConv());
}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 15965f2bac8aa..c46fc682b1162 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1676,7 +1676,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (!WCG->isOptNone() &&
- (ST->isDynamicVGPREnabled() ||
+ (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
@@ -2555,7 +2555,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
- [[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+ [[maybe_unused]] unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(MFI->isDynamicVGPREnabled());
[[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
@@ -2711,7 +2711,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
// (i.e. whether we're in dynamic VGPR mode or not).
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
// waveslot limited kernel runs slower with the deallocation.
- if (ST->isDynamicVGPREnabled()) {
+ if (MFI->isDynamicVGPREnabled()) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_ALLOC_VGPR))
@@ -2722,7 +2722,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
if (!ReleaseVGPRInsts.empty() &&
(MF.getFrameInfo().hasCalls() ||
ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+ TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass), /*IsDynamicVGPR=*/false) <
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
for (MachineInstr *MI : ReleaseVGPRInsts) {
if (ST->requiresNopBeforeDeallocVGPRs()) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index efdf642e29db3..2101904fc3d5c 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -48,6 +48,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
assert(MaxNumWorkGroups.size() == 3);
+  IsDynamicVGPREnabled =
+      F.getFnAttribute("amdgpu-dynamic-vgpr").getValueAsBool();
+
Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
CallingConv::ID CC = F.getCallingConv();
@@ -716,6 +719,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
+ IsDynamicVGPREnabled(MFI.isDynamicVGPREnabled()),
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index a60409b5a7e09..e3b2f32d1c547 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -299,6 +299,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool HasInitWholeWave = false;
+ bool IsDynamicVGPREnabled = false;
unsigned ScratchReservedForDynamicVGPRs = 0;
SIMachineFunctionInfo() = default;
@@ -352,6 +353,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
+ YamlIO.mapOptional("isDynamicVGPREnabled", MFI.IsDynamicVGPREnabled, false);
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
MFI.ScratchReservedForDynamicVGPRs, 0);
}
@@ -459,6 +461,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
+ bool IsDynamicVGPREnabled = false;
+
// The size in bytes of the scratch space reserved for the CWSR trap handler
// to spill some of the dynamic VGPRs.
unsigned ScratchReservedForDynamicVGPRs = 0;
@@ -788,6 +792,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
BytesInStackArgArea = Bytes;
}
+ bool isDynamicVGPREnabled() const {
+ return IsDynamicVGPREnabled;
+ }
+
// This is only used if we need to save any dynamic VGPRs in scratch.
unsigned getScratchReservedForDynamicVGPRs() const {
return ScratchReservedForDynamicVGPRs;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c1ac9491b2363..9005ceec1d2d1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3689,7 +3689,7 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
- return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
+ return std::min(ST.getMaxNumVGPRs(MinOcc, MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled()), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 1c777e235fb60..4c6e59caeba8b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1161,12 +1161,12 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs) {
1;
}
-unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
+unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, bool IsDynamicVGPR,
std::optional<bool> EnableWavefrontSize32) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;
- if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ if (IsDynamicVGPR)
return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
bool IsWave32 = EnableWavefrontSize32 ?
@@ -1207,18 +1207,18 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
-unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, bool IsDynamicVGPR) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
- if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ if (IsDynamicVGPR)
// On GFX12 we can allocate at most 8 blocks of VGPRs.
- return 8 * getVGPRAllocGranule(STI);
+ return 8 * getVGPRAllocGranule(STI, IsDynamicVGPR);
return getAddressableNumArchVGPRs(STI);
}
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
- unsigned NumVGPRs) {
- return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+ unsigned NumVGPRs, bool IsDynamicVGPR) {
+ return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI, IsDynamicVGPR),
getMaxWavesPerEU(STI),
getTotalNumVGPRs(STI));
}
@@ -1259,7 +1259,7 @@ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
return 5;
}
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool IsDynamicVGPR) {
assert(WavesPerEU != 0);
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
@@ -1267,28 +1267,28 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
return 0;
unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
- unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
- unsigned Granule = getVGPRAllocGranule(STI);
+ unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI, IsDynamicVGPR);
+ unsigned Granule = getVGPRAllocGranule(STI, IsDynamicVGPR);
unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
return 0;
- unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
+ unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs, IsDynamicVGPR);
if (WavesPerEU < MinWavesPerEU)
- return getMinNumVGPRs(STI, MinWavesPerEU);
+ return getMinNumVGPRs(STI, MinWavesPerEU, IsDynamicVGPR);
unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
return std::min(MinNumVGPRs, AddrsableNumVGPRs);
}
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool IsDynamicVGPR) {
assert(WavesPerEU != 0);
unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
- getVGPRAllocGranule(STI));
- unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
+ getVGPRAllocGranule(STI, IsDynamicVGPR));
+ unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI, IsDynamicVGPR);
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
@@ -1300,10 +1300,10 @@ unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
}
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI,
- unsigned NumVGPRs,
+ unsigned NumVGPRs, bool IsDynamicVGPR,
std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks(
- NumVGPRs, getVGPRAllocGranule(STI, EnableWavefrontSize32));
+ NumVGPRs, getVGPRAllocGranule(STI, IsDynamicVGPR, EnableWavefrontSize32));
}
} // end namespace IsaInfo
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f61a99c37e669..798c8baeccd47 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -307,7 +307,7 @@ unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs);
/// For subtargets which support it, \p EnableWavefrontSize32 should match
/// the ENABLE_WAVEFRONT_SIZE32 kernel descriptor field.
unsigned
-getVGPRAllocGranule(const MCSubtargetInfo *STI,
+getVGPRAllocGranule(const MCSubtargetInfo *STI, bool IsDynamicVGPR,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
/// \returns VGPR encoding granularity for given subtarget \p STI.
@@ -326,20 +326,20 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI);
/// \returns Addressable number of VGPRs for given subtarget \p STI.
-unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI);
+unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, bool IsDynamicVGPR);
/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
-unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool IsDynamicVGPR);
/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement for given subtarget \p STI.
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool IsDynamicVGPR);
/// \returns Number of waves reachable for a given \p NumVGPRs usage for given
/// subtarget \p STI.
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
- unsigned NumVGPRs);
+ unsigned NumVGPRs, bool IsDynamicVGPR);
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
@@ -365,7 +365,7 @@ unsigned getEncodedNumVGPRBlocks(
/// \returns Number of VGPR blocks that need to be allocated for the given
/// subtarget \p STI when \p NumVGPRs are used.
unsigned getAllocatedNumVGPRBlocks(
- const MCSubtargetInfo *STI, unsigned NumVGPRs,
+ const MCSubtargetInfo *STI, unsigned NumVGPRs, bool IsDynamicVGPR,
std::optional<bool> EnableWavefrontSize32 = std::nullopt);
} // end namespace IsaInfo
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
index ca2fca69dcf21..1b59845c53cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@@ -92,7 +92,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 {
ret void
}
-define amdgpu_cs void @with_spills() {
+define amdgpu_cs void @with_spills() #0 {
; CHECK-LABEL: with_spills:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2)
@@ -246,6 +246,6 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
-attributes #0 = { nounwind }
-attributes #1 = { nounwind "frame-pointer"="none" }
-attributes #2 = { nounwind "frame-pointer"="all" }
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr"="true" }
+attributes #1 = { nounwind "frame-pointer"="none" "amdgpu-dynamic-vgpr"="true" }
+attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-dynamic-vgpr"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
index 2de6699aab665..b4c74a1ce8323 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
+++ b/llvm/test/CodeGen/AMDGPU/machine-function-info-cwsr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -stop-after=prologepilog < %s | FileCheck -check-prefix=CHECK %s
; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
@@ -68,5 +68,5 @@ define void @default() #0 {
declare amdgpu_gfx void @callee(i32) #0
-attributes #0 = { nounwind }
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
new file mode 100644
index 0000000000000..6c1ab0359bffb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll
@@ -0,0 +1,305 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .dynamic_vgpr_en: true
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: 0
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0x200
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x40
+; CHECK-NEXT: .wgp_mode: true
+; CHECK: .registers: {}
+; CHECK-NEXT: .shader_functions:
+; CHECK-NEXT: dynamic_stack:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: dynamic_stack_loop:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: multiple_stack:
+; CHECK-NEXT: .backend_stack_size: 0x24
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack:
+; CHECK-NEXT: .backend_stack_size: 0
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: no_stack_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack_extern_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_extern_call_many_args:
+; CHECK-NEXT: .backend_stack_size: 0x90
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_indirect_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_lds:
+; CHECK-NEXT: .backend_stack_size: 0
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: simple_lds_recurse:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x29
+; CHECK-NEXT: simple_stack:
+; CHECK-NEXT: .backend_stack_size: 0x14
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: simple_stack_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x4
+; CHECK-NEXT: simple_stack_extern_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_indirect_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_recurse:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x2a
+; CHECK:amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT:...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+declare amdgpu_gfx float @extern_func(float) #0
+declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
+
+ at funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+ %add = fadd float %arg0, 1.0
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %stack2 = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack2
+ %val2 = load volatile float, ptr addrspace(5) %stack2
+ %add2 = fadd float %add, %val2
+ ret float %add2
+}
+
+define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
+bb0:
+ %cmp = fcmp ogt float %arg0, 0.0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ br label %bb2
+
+bb2:
+ %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
+ ret float %res
+}
+
+define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
+bb0:
+ br label %bb1
+
+bb1:
+ %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %cmp = icmp sgt i32 %ctr, 0
+ %newctr = sub i32 %ctr, 1
+ br i1 %cmp, label %bb1, label %bb2
+
+bb2:
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+ at lds = internal addrspace(3) global [64 x float] poison
+
+define amdgpu_gfx float @simple_lds(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ ret float %val
+}
+
+define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
+ ret float %res
+}
+
+attributes #0 = { nounwind "amdgpu-dynamic-vgpr"="true" }
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 638dc8965987e..928eabfae62f8 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -1,13 +1,11 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX12 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,GFX12,DVGPR %s
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
-; DVGPR-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
new file mode 100644
index 0000000000000..8b8a02618a151
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll
@@ -0,0 +1,205 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s
+
+; CHECK-LABEL: {{^}}_amdgpu_cs_main:
+; CHECK: ; TotalNumSgprs: 34
+; CHECK: ; NumVgprs: 2
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .dynamic_vgpr_en: true
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .graphics_registers:
+; CHECK-NEXT: .ps_extra_lds_size: 0
+; CHECK-NEXT: .spi_ps_input_addr:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: true
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: true
+; CHECK-NEXT: .linear_centroid_ena: true
+; CHECK-NEXT: .linear_sample_ena: true
+; CHECK-NEXT: .persp_center_ena: true
+; CHECK-NEXT: .persp_centroid_ena: true
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: true
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .spi_ps_input_ena:
+; CHECK-NEXT: .ancillary_ena: false
+; CHECK-NEXT: .front_face_ena: false
+; CHECK-NEXT: .line_stipple_tex_ena: false
+; CHECK-NEXT: .linear_center_ena: false
+; CHECK-NEXT: .linear_centroid_ena: false
+; CHECK-NEXT: .linear_sample_ena: false
+; CHECK-NEXT: .persp_center_ena: false
+; CHECK-NEXT: .persp_centroid_ena: false
+; CHECK-NEXT: .persp_pull_model_ena: false
+; CHECK-NEXT: .persp_sample_ena: true
+; CHECK-NEXT: .pos_fixed_pt_ena: false
+; CHECK-NEXT: .pos_w_float_ena: false
+; CHECK-NEXT: .pos_x_float_ena: false
+; CHECK-NEXT: .pos_y_float_ena: false
+; CHECK-NEXT: .pos_z_float_ena: false
+; CHECK-NEXT: .sample_coverage_ena: false
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .dynamic_vgpr_saved_count: 0x70
+; CHECK-NEXT: .entry_point: _amdgpu_cs
+; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x40
+; CHECK-NEXT: .wgp_mode: false
+; CHECK-NEXT: .gs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_gs
+; CHECK-NEXT: .entry_point_symbol: gs_shader
+; CHECK-NEXT: .lds_size: 0x200
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: true
+; CHECK-NEXT: .hs:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_hs
+; CHECK-NEXT: .entry_point_symbol: hs_shader
+; CHECK-NEXT: .lds_size: 0x1000
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: true
+; CHECK-NEXT: .ps:
+; CHECK-NEXT: .debug_mode: false
+; CHECK-NEXT: .entry_point: _amdgpu_ps
+; CHECK-NEXT: .entry_point_symbol: ps_shader
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .scratch_en: false
+; CHECK-NEXT: .scratch_memory_size: 0
+; CHECK-NEXT: .sgpr_count: 0x1
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: .wgp_mode: true
+; CHECK: .registers: {}
+; CHECK: amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT: ...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
+.entry:
+ %i = call i64 @llvm.amdgcn.s.getpc()
+ %i1 = and i64 %i, -4294967296
+ %i2 = zext i32 %arg1 to i64
+ %i3 = or i64 %i1, %i2
+ %i4 = inttoptr i64 %i3 to ptr addrspace(4)
+ %i5 = and i32 %arg2, 1023
+ %i6 = lshr i32 %arg2, 10
+ %i7 = and i32 %i6, 1023
+ %i8 = add nuw nsw i32 %i7, %i5
+ %i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
+ %.idx = shl nuw nsw i32 %i8, 2
+ call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
+ ret void
+}
+
+define dllexport amdgpu_ps void @ps_shader() #1 {
+ ret void
+}
+
+ at LDS.GS = external addrspace(3) global [1 x i32], align 4
+
+define dllexport amdgpu_gs void @gs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+ at LDS.HS = external addrspace(3) global [1024 x i32], align 4
+
+define dllexport amdgpu_hs void @hs_shader() #2 {
+ %ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
+ store i32 0, ptr addrspace(3) %ptr, align 4
+ ret void
+}
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc() #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode" "amdgpu-dynamic-vgpr"="true" }
+
+attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
index 5748f6b188acf..50aa2be50d913 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll
@@ -1,17 +1,14 @@
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11,NODVGPR
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK,NODVGPR
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr <%s | FileCheck %s --check-prefixes=CHECK,DVGPR
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 <%s | FileCheck %s --check-prefixes=CHECK,GFX11
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 <%s | FileCheck %s --check-prefixes=CHECK
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
-; NODVGPR: ; TotalNumSgprs: 4
-; DVGPR: ; TotalNumSgprs: 34
+; CHECK: ; TotalNumSgprs: 4
; CHECK: ; NumVgprs: 2
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
-; DVGPR-NEXT: .dynamic_vgpr_en: true
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
@@ -57,7 +54,6 @@
; CHECK-NEXT: .cs:
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
-; DVGPR-NEXT: .dynamic_vgpr_saved_count: 0x70
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
@@ -68,8 +64,7 @@
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
-; NODVGPR-NEXT: .sgpr_count: 0x4
-; DVGPR-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .sgpr_count: 0x4
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
new file mode 100644
index 0000000000000..24885164dd8da
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12-dvgpr.mir
@@ -0,0 +1,340 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
+
+--- |
+ define amdgpu_ps void @tbuffer_store1() #0 { ret void }
+ define amdgpu_ps void @tbuffer_store2() #0 { ret void }
+ define amdgpu_ps void @flat_store() #0 { ret void }
+ define amdgpu_ps void @global_store() #0 { ret void }
+ define amdgpu_ps void @buffer_store_format() #0 { ret void }
+ define amdgpu_ps void @ds_write_b32() #0 { ret void }
+ define amdgpu_ps void @global_store_dword() #0 { ret void }
+ define amdgpu_ps void @multiple_basic_blocks1() #0 { ret void }
+ define amdgpu_ps void @multiple_basic_blocks2() #0 { ret void }
+ define amdgpu_ps void @multiple_basic_blocks3() #0 { ret void }
+ define amdgpu_ps void @recursive_loop() #0 { ret void }
+ define amdgpu_ps void @recursive_loop_vmem() #0 { ret void }
+ define amdgpu_ps void @image_store() #0 { ret void }
+ define amdgpu_ps void @scratch_store() #0 { ret void }
+ define amdgpu_ps void @buffer_atomic() #0 { ret void }
+ define amdgpu_ps void @flat_atomic() #0 { ret void }
+ define amdgpu_ps void @global_atomic() #0 { ret void }
+ define amdgpu_ps void @image_atomic() #0 { ret void }
+ define amdgpu_ps void @global_store_optnone() #1 { ret void }
+
+  attributes #0 = { "amdgpu-dynamic-vgpr"="true" }
+  attributes #1 = { "amdgpu-dynamic-vgpr"="true" noinline optnone }
+...
+
+---
+name: tbuffer_store1
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: tbuffer_store2
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: tbuffer_store2
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: global_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
+
+---
+name: buffer_store_format
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_store_format
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: ds_write_b32
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: ds_write_b32
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = IMPLICIT_DEF
+ DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0
+ S_ENDPGM 0
+
+...
+---
+name: global_store_dword
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr0_sgpr1
+
+ ; CHECK-LABEL: name: global_store_dword
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
+ GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks1
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks1
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+name: multiple_basic_blocks2
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks2
+ ; CHECK: bb.2:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: multiple_basic_blocks3
+body: |
+ ; CHECK-LABEL: name: multiple_basic_blocks3
+ ; CHECK: bb.4:
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.1:
+ successors: %bb.2
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.4
+
+ S_BRANCH %bb.4
+
+ bb.3:
+ successors: %bb.4
+
+ $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop
+body: |
+ ; CHECK-LABEL: name: recursive_loop
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: recursive_loop_vmem
+body: |
+ ; CHECK-LABEL: name: recursive_loop_vmem
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.1, %bb.2
+
+ TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec
+ S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+ S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: image_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: scratch_store
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: scratch_store
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
+ SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+---
+name: buffer_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: buffer_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: flat_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: flat_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
+
+
+---
+name: global_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: image_atomic
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: image_atomic
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK: S_ALLOC_VGPR 0
+ ; CHECK: S_ENDPGM 0
+ renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
+ S_ENDPGM 0
+...
+
+---
+name: global_store_optnone
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: global_store_optnone
+ ; CHECK-NOT: S_SENDMSG 3
+ ; CHECK-NOT: S_ALLOC_VGPR
+ ; CHECK: S_ENDPGM 0
+ GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
+ S_WAIT_STORECNT 0
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
index d465bf95fbfbe..69e3d2a7fe658 100644
--- a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
+++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir
@@ -1,5 +1,4 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT
-# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s
--- |
define amdgpu_ps void @tbuffer_store1() { ret void }
@@ -29,8 +28,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store1
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec
S_ENDPGM 0
@@ -42,8 +40,7 @@ body: |
bb.0:
; CHECK-LABEL: name: tbuffer_store2
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7)
S_ENDPGM 0
@@ -55,8 +52,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_store
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@@ -68,8 +64,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_store
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
S_WAIT_STORECNT 0
@@ -82,8 +77,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_store_format
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec
S_ENDPGM 0
@@ -95,8 +89,7 @@ body: |
bb.0:
; CHECK-LABEL: name: ds_write_b32
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = IMPLICIT_DEF
renamable $vgpr1 = IMPLICIT_DEF
@@ -112,8 +105,7 @@ body: |
; CHECK-LABEL: name: global_store_dword
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec
GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec
@@ -125,8 +117,7 @@ name: multiple_basic_blocks1
body: |
; CHECK-LABEL: name: multiple_basic_blocks1
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -153,8 +144,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks2
; CHECK: bb.2:
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@@ -180,8 +170,7 @@ body: |
; CHECK-LABEL: name: multiple_basic_blocks3
; CHECK: bb.4:
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.2
@@ -216,8 +205,7 @@ name: recursive_loop
body: |
; CHECK-LABEL: name: recursive_loop
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -241,8 +229,7 @@ name: recursive_loop_vmem
body: |
; CHECK-LABEL: name: recursive_loop_vmem
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
bb.0:
successors: %bb.1
@@ -268,8 +255,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_store
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7)
S_ENDPGM 0
@@ -281,8 +267,7 @@ body: |
bb.0:
; CHECK-LABEL: name: scratch_store
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc
SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr
@@ -295,8 +280,7 @@ body: |
bb.0:
; CHECK-LABEL: name: buffer_atomic
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7)
S_ENDPGM 0
@@ -308,8 +292,7 @@ body: |
bb.0:
; CHECK-LABEL: name: flat_atomic
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr
S_ENDPGM 0
@@ -322,8 +305,7 @@ body: |
bb.0:
; CHECK-LABEL: name: global_atomic
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec
S_ENDPGM 0
@@ -335,8 +317,7 @@ body: |
bb.0:
; CHECK-LABEL: name: image_atomic
; CHECK-NOT: S_SENDMSG 3
- ; DEFAULT-NOT: S_ALLOC_VGPR
- ; DVGPR: S_ALLOC_VGPR 0
+ ; CHECK-NOT: S_ALLOC_VGPR
; CHECK: S_ENDPGM 0
renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7)
S_ENDPGM 0
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index b1bfa79efbecd..d8ffa8113a960 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -94,15 +94,15 @@ static const std::pair<StringRef, StringRef>
W64FS = {"+wavefrontsize64", "w64"};
using TestFuncTy =
- function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &)>;
+ function_ref<bool(std::stringstream &, unsigned, const GCNSubtarget &, bool)>;
static bool testAndRecord(std::stringstream &Table, const GCNSubtarget &ST,
- TestFuncTy test) {
+ TestFuncTy test, bool IsDynamicVGPR) {
bool Success = true;
unsigned MaxOcc = ST.getMaxWavesPerEU();
for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
Table << std::right << std::setw(3) << Occ << " ";
- Success = test(Table, Occ, ST) && Success;
+ Success = test(Table, Occ, ST, IsDynamicVGPR) && Success;
Table << '\n';
}
return Success;
@@ -132,7 +132,7 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
FS = &W32FS;
std::stringstream Table;
- bool Success = testAndRecord(Table, ST, test);
+ bool Success = testAndRecord(Table, ST, test, /*IsDynamicVGPR=*/false);
if (!Success || PrintCpuRegLimits)
TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
@@ -155,16 +155,14 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
TestFuncTy test) {
- auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
- "+dynamic-vgpr," + FS.str());
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
ASSERT_TRUE(TM) << "No target machine";
GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
std::string(TM->getTargetFeatureString()), *TM);
- ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
std::stringstream Table;
- bool Success = testAndRecord(Table, ST, test);
+ bool Success = testAndRecord(Table, ST, test, /*IsDynamicVGPR=*/true);
EXPECT_TRUE(Success && !PrintCpuRegLimits)
<< CPUName << " dynamic VGPR " << FS
<< ":\nOcc MinVGPR MaxVGPR\n"
@@ -172,13 +170,13 @@ static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
}
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
- auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
- unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
+ auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST, bool IsDynamicVGPR) {
+ unsigned MaxVGPRNum = ST.getAddressableNumVGPRs(IsDynamicVGPR);
return checkMinMax(
- OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
- [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
- [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
- [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
+ OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum, IsDynamicVGPR), ST.getMaxWavesPerEU(),
+ [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs, IsDynamicVGPR); },
+ [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ, IsDynamicVGPR); },
+ [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ, IsDynamicVGPR); });
};
testGPRLimits("VGPR", true, test);
@@ -188,7 +186,7 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
"+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
}
-static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS, bool IsDynamicVGPR,
unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
unsigned ExpectedMaxVGPRs) {
auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
@@ -205,12 +203,14 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
GlobalValue::ExternalLinkage, "testFunc", &M);
Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+ if (IsDynamicVGPR)
+ Func->addFnAttr("amdgpu-dynamic-vgpr", "true");
auto Range = ST.getWavesPerEU(*Func);
EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
- EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+ EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs(IsDynamicVGPR))
<< CPUName << ' ' << FS;
// Function with requested 'amdgpu-waves-per-eu' in a valid range.
@@ -221,10 +221,10 @@ static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
}
TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
- testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
- testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32", false, 1, 16, 256);
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32", true, 1, 16, 128);
testAbsoluteLimits(
- "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+ "gfx1200", "+wavefrontsize32,+dynamic-vgpr-block-size-32", true,
1, 16, 256);
}
More information about the llvm-commits
mailing list