[llvm] [AMDGPU] Use correct VGPR threshold for flagging ExcessRP regions in unified register file case (PR #85860)

Tue Mar 19 13:15:15 PDT 2024

https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/85860

`ST.getMaxNumVGPRs(MF)` lowers to `AMDGPUBaseInfo.cpp:getTotalNumVGPRs` which returns 512 for gfx90a. This is subsequently limited by `AMDGPUBaseInfo:getAddressableNumVGPRs()`, which also returns 512 for gfx90a. The ISA states we can have up to 256 of each of AGPR and VGPR, for a total of 512 (gfx90a 3.6.4).

`ST.getMaxNumVGPRs(MF)` therefore calculates the maximum possible number of combined VGPR + AGPR.

However, ST.getMaxNumVGPRs(MF) is being used in the scheduler as the maximum number of specifically accvgpr and the maxinum number of specifically archvgpr. 

By adding a parameter to getMaxNumVGPRs, the caller can control whether to return the maximum total VGPRs or maxmimum acc/arch VGPRs. These values will only differ when we have the unified register file case and getTotalNumVGPRs(STI) / WavesPerEU > getAddressableNumArchVGPRs(STI) (that is, when WavesPerEu == 1). While this doesn't really seem to have an impact based on lit tests, it does have an impact on several real kernels I'm working with. 

It is not unreasonable to think other clients of getTotalNumVGPRs are using it in the wrong way.

Still working on test cases.

>From 188669688b279349002cd6851f12f4c29d50a1d5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 19 Mar 2024 12:57:52 -0700
Subject: [PATCH] [AMDGPU] Use correct VGPR threshold for flagging ExcessRP
 regions in unified register file case

Change-Id: Ie09cd894f433b3bb43a031a6eab44d9dc2dab0c6
---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    | 18 +++++++++-------
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     |  4 ++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  8 ++++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         | 21 ++++++++++++-------
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  7 +++++--
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++-
 .../AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir |  2 ++
 7 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fa77b94fc22def..248713bca8a688 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -809,11 +809,13 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
                             getReservedNumSGPRs(F));
 }
 
-unsigned GCNSubtarget::getBaseMaxNumVGPRs(
-    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
+unsigned
+GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
+                                 std::pair<unsigned, unsigned> WavesPerEU,
+                                 bool WholeRegisterFile) const {
   // Compute maximum number of VGPRs function can use using default/requested
   // minimum number of waves per execution unit.
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first, WholeRegisterFile);
 
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
@@ -839,14 +841,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
   return MaxNumVGPRs;
 }
 
-unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
-  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F,
+                                      bool WholeRegisterFile) const {
+  return getBaseMaxNumVGPRs(F, getWavesPerEU(F), WholeRegisterFile);
 }
 
-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF,
+                                      bool WholeRegisterFile) const {
   const Function &F = MF.getFunction();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU(), WholeRegisterFile);
 }
 
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 5c394e6d6296d0..b6fd5deb880f76 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -110,7 +110,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   if (Occ != OtherOcc)
     return Occ > OtherOcc;
 
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ true);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
   // SGPR excess pressure conditions
@@ -124,7 +124,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   unsigned OtherVGPRForSGPRSpills =
       (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
 
-  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+  unsigned MaxArchVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ false);
 
   // Unified excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9f419a7fbf6834..7e73f5ce49e9b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -974,11 +974,13 @@ void GCNSchedStage::checkScheduling() {
                       << DAG.MinOccupancy << ".\n");
   }
 
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ true);
+  unsigned MaxArchVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ false);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
+      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RescheduleRegions[RegionIdx] = true;
     DAG.RegionsWithHighRP[RegionIdx] = true;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ca51da659c3311..fd3d124e257414 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1414,26 +1414,32 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+  unsigned getMaxNumVGPRs(unsigned WavesPerEU,
+                          bool WholeRegisterFile = true) const {
+    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU, WholeRegisterFile);
   }
 
   /// \returns max num VGPRs. This is the common utility function
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> WavesPerEU) const;
+                              std::pair<unsigned, unsigned> WavesPerEU,
+                              bool WholeRegisterFile) const;
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+  /// If \p WholeRegisterFile is false and our target has a unified register
+  /// file, getMaxNumVGPRs will instead \return the maxmium number of ArchVGPRs.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
-  unsigned getMaxNumVGPRs(const Function &F) const;
+  unsigned getMaxNumVGPRs(const Function &F,
+                          bool WholeRegisterFile = true) const;
 
-  unsigned getMaxNumAGPRs(const Function &F) const {
-    return getMaxNumVGPRs(F);
+  unsigned getMaxNumAGPRs(const Function &F,
+                          bool WholeRegisterFile = true) const {
+    return getMaxNumVGPRs(F, WholeRegisterFile);
   }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -1444,7 +1450,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
-  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+  unsigned getMaxNumVGPRs(const MachineFunction &MF,
+                          bool WholeRegisterFile = true) const;
 
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d53f68ace70df..a75b080b5850af 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1155,12 +1155,15 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
 
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                        bool WholeRegisterFile) {
   assert(WavesPerEU != 0);
 
   unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
                                    getVGPRAllocGranule(STI));
-  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
+  unsigned AddressableNumVGPRs = WholeRegisterFile
+                                     ? getAddressableNumVGPRs(STI)
+                                     : getAddressableNumArchVGPRs(STI);
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 29ac402d953513..f49bb26e060e37 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -308,7 +308,8 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
 /// \returns Maximum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                        bool WholeRegisterFile);
 
 /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
 /// subtarget \p STI.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
index 091b29c23d60e2..dc5df7f039b04c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@@ -4,6 +4,8 @@
 --- |
   define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }
 
+  attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1"  "amdgpu-flat-work-group-size"="1,256" }
+
   !0 = distinct !{!0}
   !1 = !{!1, !0}
 ...