[llvm] 1f84495 - [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (#130047)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 19 02:29:42 PDT 2025
Author: Diana Picus
Date: 2025-03-19T10:29:38+01:00
New Revision: 1f844952558b041ff1b4c27eed7b81c15841ee84
URL: https://github.com/llvm/llvm-project/commit/1f844952558b041ff1b4c27eed7b81c15841ee84
DIFF: https://github.com/llvm/llvm-project/commit/1f844952558b041ff1b4c27eed7b81c15841ee84.diff
LOG: [AMDGPU] Update target helpers & GCNSchedStrategy for dynamic VGPRs (#130047)
In dynamic VGPR mode, we can allocate up to 8 blocks of either 16 or 32
VGPRs (based on a chip-wide setting which we can model with a Subtarget
feature). Update some of the subtarget helpers to reflect this.
In particular:
- getVGPRAllocGranule is set to the block size
- getAddressableNumVGPRs will limit itself to 8 * size of a block
We also try to be more careful about how many VGPR blocks we allocate.
Therefore, when deciding if we should revert scheduling after a given
stage, we check that we haven't increased the number of VGPR blocks that
need to be allocated.
---------
Co-authored-by: Jannik Silvanus <jannik.silvanus at amd.com>
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.td
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
llvm/unittests/Target/AMDGPU/CMakeLists.txt
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 4532975612b1d..0cac6d9674b5a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1263,6 +1263,12 @@ def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"Enable dynamic VGPR mode"
>;
+def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
+ "DynamicVGPRBlockSize32",
+ "true",
+ "Use a block size of 32 for dynamic VGPR allocation (default is 16)"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 5dcf523430fd2..2fd015e499924 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1452,6 +1452,16 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
if (WavesAfter < DAG.MinOccupancy)
return true;
+ // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
+ if (ST.isDynamicVGPREnabled()) {
+ unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+ &ST, PressureBefore.getVGPRNum(false));
+ unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
+ &ST, PressureAfter.getVGPRNum(false));
+ if (BlocksAfter > BlocksBefore)
+ return true;
+ }
+
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 99892d9a60423..f3d6f95568dde 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -192,6 +192,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
unsigned MaxHardClauseLength = 0;
bool SupportsSRAMECC = false;
bool DynamicVGPR = false;
+ bool DynamicVGPRBlockSize32 = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 528565175f749..2613aa9ef8d56 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1166,6 +1166,9 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 8;
+ if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ return STI->getFeatureBits().test(FeatureDynamicVGPRBlockSize32) ? 32 : 16;
+
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1207,6 +1210,9 @@ unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
if (STI->getFeatureBits().test(FeatureGFX90AInsts))
return 512;
+ if (STI->getFeatureBits().test(FeatureDynamicVGPR))
+ // On GFX12 we can allocate at most 8 blocks of VGPRs.
+ return 8 * getVGPRAllocGranule(STI);
return getAddressableNumArchVGPRs(STI);
}
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index e22cfb1769561..b1bfa79efbecd 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -153,6 +153,24 @@ static void testGPRLimits(const char *RegName, bool TestW32W64,
EXPECT_TRUE(ErrStr.empty()) << ErrStr;
}
+static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
+ TestFuncTy test) {
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName,
+ "+dynamic-vgpr," + FS.str());
+ ASSERT_TRUE(TM) << "No target machine";
+
+ GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM);
+ ASSERT_TRUE(ST.getFeatureBits().test(AMDGPU::FeatureDynamicVGPR));
+
+ std::stringstream Table;
+ bool Success = testAndRecord(Table, ST, test);
+ EXPECT_TRUE(Success && !PrintCpuRegLimits)
+ << CPUName << " dynamic VGPR " << FS
+ << ":\nOcc MinVGPR MaxVGPR\n"
+ << Table.str() << '\n';
+}
+
TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
auto test = [](std::stringstream &OS, unsigned Occ, const GCNSubtarget &ST) {
unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
@@ -164,6 +182,50 @@ TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
};
testGPRLimits("VGPR", true, test);
+
+ testDynamicVGPRLimits("gfx1200", "+wavefrontsize32", test);
+ testDynamicVGPRLimits("gfx1200",
+ "+wavefrontsize32,+dynamic-vgpr-block-size-32", test);
+}
+
+static void testAbsoluteLimits(StringRef CPUName, StringRef FS,
+ unsigned ExpectedMinOcc, unsigned ExpectedMaxOcc,
+ unsigned ExpectedMaxVGPRs) {
+ auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS);
+ ASSERT_TRUE(TM) << "No target machine";
+
+ GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+ std::string(TM->getTargetFeatureString()), *TM);
+
+ // Test function without attributes.
+ LLVMContext Context;
+ Module M("", Context);
+ Function *Func =
+ Function::Create(FunctionType::get(Type::getVoidTy(Context), false),
+ GlobalValue::ExternalLinkage, "testFunc", &M);
+ Func->setCallingConv(CallingConv::AMDGPU_CS_Chain);
+ Func->addFnAttr("amdgpu-flat-work-group-size", "1,32");
+
+ auto Range = ST.getWavesPerEU(*Func);
+ EXPECT_EQ(ExpectedMinOcc, Range.first) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxOcc, Range.second) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxVGPRs, ST.getMaxNumVGPRs(*Func)) << CPUName << ' ' << FS;
+ EXPECT_EQ(ExpectedMaxVGPRs, ST.getAddressableNumVGPRs())
+ << CPUName << ' ' << FS;
+
+ // Function with requested 'amdgpu-waves-per-eu' in a valid range.
+ Func->addFnAttr("amdgpu-waves-per-eu", "10,12");
+ Range = ST.getWavesPerEU(*Func);
+ EXPECT_EQ(10u, Range.first) << CPUName << ' ' << FS;
+ EXPECT_EQ(12u, Range.second) << CPUName << ' ' << FS;
+}
+
+TEST(AMDGPU, TestOccupancyAbsoluteLimits) {
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32", 1, 16, 256);
+ testAbsoluteLimits("gfx1200", "+wavefrontsize32,+dynamic-vgpr", 1, 16, 128);
+ testAbsoluteLimits(
+ "gfx1200", "+wavefrontsize32,+dynamic-vgpr,+dynamic-vgpr-block-size-32",
+ 1, 16, 256);
}
static const char *printSubReg(const TargetRegisterInfo &TRI, unsigned SubReg) {
diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index ca8f48bc393ef..6d6f17883a07e 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ set(LLVM_LINK_COMPONENTS
Core
GlobalISel
MC
+ MIRParser
Support
TargetParser
)
More information about the llvm-commits
mailing list