[llvm] [AMDGPU] Use correct VGPR threshold for flagging ExcessRP regions in unified register file case (PR #85860)

Thu Mar 21 11:29:48 PDT 2024

https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/85860

>From 188669688b279349002cd6851f12f4c29d50a1d5 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 19 Mar 2024 12:57:52 -0700
Subject: [PATCH 1/3] [AMDGPU] Use correct VGPR threshold for flagging ExcessRP
 regions in unified register file case

Change-Id: Ie09cd894f433b3bb43a031a6eab44d9dc2dab0c6
---
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    | 18 +++++++++-------
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     |  4 ++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  8 ++++---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         | 21 ++++++++++++-------
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  7 +++++--
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  3 ++-
 .../AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir |  2 ++
 7 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index fa77b94fc22def..248713bca8a688 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -809,11 +809,13 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
                             getReservedNumSGPRs(F));
 }
 
-unsigned GCNSubtarget::getBaseMaxNumVGPRs(
-    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
+unsigned
+GCNSubtarget::getBaseMaxNumVGPRs(const Function &F,
+                                 std::pair<unsigned, unsigned> WavesPerEU,
+                                 bool WholeRegisterFile) const {
   // Compute maximum number of VGPRs function can use using default/requested
   // minimum number of waves per execution unit.
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first, WholeRegisterFile);
 
   // Check if maximum number of VGPRs was explicitly requested using
   // "amdgpu-num-vgpr" attribute.
@@ -839,14 +841,16 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs(
   return MaxNumVGPRs;
 }
 
-unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
-  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
+unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F,
+                                      bool WholeRegisterFile) const {
+  return getBaseMaxNumVGPRs(F, getWavesPerEU(F), WholeRegisterFile);
 }
 
-unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF,
+                                      bool WholeRegisterFile) const {
   const Function &F = MF.getFunction();
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
+  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU(), WholeRegisterFile);
 }
 
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 5c394e6d6296d0..b6fd5deb880f76 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -110,7 +110,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   if (Occ != OtherOcc)
     return Occ > OtherOcc;
 
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ true);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
   // SGPR excess pressure conditions
@@ -124,7 +124,7 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   unsigned OtherVGPRForSGPRSpills =
       (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
 
-  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+  unsigned MaxArchVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ false);
 
   // Unified excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9f419a7fbf6834..7e73f5ce49e9b8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -974,11 +974,13 @@ void GCNSchedStage::checkScheduling() {
                       << DAG.MinOccupancy << ".\n");
   }
 
-  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ true);
+  unsigned MaxArchVGPRs = ST.getMaxNumVGPRs(MF, /*WholeRegisterFile*/ false);
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
+      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RescheduleRegions[RegionIdx] = true;
     DAG.RegionsWithHighRP[RegionIdx] = true;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ca51da659c3311..fd3d124e257414 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1414,26 +1414,32 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   /// \returns the maximum number of VGPRs that can be used and still achieved
   /// at least the specified number of waves \p WavesPerEU.
-  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
-    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
+  unsigned getMaxNumVGPRs(unsigned WavesPerEU,
+                          bool WholeRegisterFile = true) const {
+    return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU, WholeRegisterFile);
   }
 
   /// \returns max num VGPRs. This is the common utility function
   /// called by MachineFunction and Function variants of getMaxNumVGPRs.
   unsigned getBaseMaxNumVGPRs(const Function &F,
-                              std::pair<unsigned, unsigned> WavesPerEU) const;
+                              std::pair<unsigned, unsigned> WavesPerEU,
+                              bool WholeRegisterFile) const;
   /// \returns Maximum number of VGPRs that meets number of waves per execution
   /// unit requirement for function \p F, or number of VGPRs explicitly
   /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
+  /// If \p WholeRegisterFile is false and our target has a unified register
+  /// file, getMaxNumVGPRs will instead return the maximum number of ArchVGPRs.
   ///
   /// \returns Value that meets number of waves per execution unit requirement
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
-  unsigned getMaxNumVGPRs(const Function &F) const;
+  unsigned getMaxNumVGPRs(const Function &F,
+                          bool WholeRegisterFile = true) const;
 
-  unsigned getMaxNumAGPRs(const Function &F) const {
-    return getMaxNumVGPRs(F);
+  unsigned getMaxNumAGPRs(const Function &F,
+                          bool WholeRegisterFile = true) const {
+    return getMaxNumVGPRs(F, WholeRegisterFile);
   }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
@@ -1444,7 +1450,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// if explicitly requested value cannot be converted to integer, violates
   /// subtarget's specifications, or does not meet number of waves per execution
   /// unit requirement.
-  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
+  unsigned getMaxNumVGPRs(const MachineFunction &MF,
+                          bool WholeRegisterFile = true) const;
 
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 6d53f68ace70df..a75b080b5850af 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1155,12 +1155,15 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
 
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                        bool WholeRegisterFile) {
   assert(WavesPerEU != 0);
 
   unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
                                    getVGPRAllocGranule(STI));
-  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(STI);
+  unsigned AddressableNumVGPRs = WholeRegisterFile
+                                     ? getAddressableNumVGPRs(STI)
+                                     : getAddressableNumArchVGPRs(STI);
   return std::min(MaxNumVGPRs, AddressableNumVGPRs);
 }
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 29ac402d953513..f49bb26e060e37 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -308,7 +308,8 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
 /// \returns Maximum number of VGPRs that meets given number of waves per
 /// execution unit requirement for given subtarget \p STI.
-unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
+                        bool WholeRegisterFile);
 
 /// \returns Number of waves reachable for a given \p NumVGPRs usage for given
 /// subtarget \p STI.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
index 091b29c23d60e2..dc5df7f039b04c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@@ -4,6 +4,8 @@
 --- |
   define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }
 
+  attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1"  "amdgpu-flat-work-group-size"="1,256" }
+
   !0 = distinct !{!0}
   !1 = !{!1, !0}
 ...

>From db3214c16372380e3789b8bf038d29b978c954ff Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 19 Mar 2024 15:22:14 -0700
Subject: [PATCH 2/3] fix logic/calculation bug

---
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 9 +++++----
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h   | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a75b080b5850af..25afa07bf90142 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1102,9 +1102,9 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
   return IsWave32 ? 8 : 4;
 }
 
-unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI, bool WholeRegisterFile) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
-    return 512;
+    return WholeRegisterFile ? 512 : getAddressableNumArchVGPRs(STI);
   if (!isGFX10Plus(*STI))
     return 256;
   bool IsWave32 = STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1159,8 +1159,9 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         bool WholeRegisterFile) {
   assert(WavesPerEU != 0);
 
-  unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
-                                   getVGPRAllocGranule(STI));
+  unsigned MaxNumVGPRs =
+      alignDown(getTotalNumVGPRs(STI, WholeRegisterFile) / WavesPerEU,
+                getVGPRAllocGranule(STI));
   unsigned AddressableNumVGPRs = WholeRegisterFile
                                      ? getAddressableNumVGPRs(STI)
                                      : getAddressableNumArchVGPRs(STI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f49bb26e060e37..e56df21faa0ca0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -293,7 +293,8 @@ unsigned getVGPREncodingGranule(
     std::optional<bool> EnableWavefrontSize32 = std::nullopt);
 
 /// \returns Total number of VGPRs for given subtarget \p STI.
-unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI,
+                          bool WholeRegisterFile = true);
 
 /// \returns Addressable number of architectural VGPRs for a given subtarget \p
 /// STI.

>From dca10b1045e34325c0aad7a3f7f1d00065266bde Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 21 Mar 2024 10:38:45 -0700
Subject: [PATCH 3/3] clamp to TotalNumArch + disallow whole register file for
 agpr

Change-Id: Ief7c8cd606934185968a914105b8795f669e204d
---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  5 ++---
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    | 19 +++++++++++++++----
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  8 ++++++--
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index fd3d124e257414..2b1e7570f9e96f 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1437,9 +1437,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   unsigned getMaxNumVGPRs(const Function &F,
                           bool WholeRegisterFile = true) const;
 
-  unsigned getMaxNumAGPRs(const Function &F,
-                          bool WholeRegisterFile = true) const {
-    return getMaxNumVGPRs(F, WholeRegisterFile);
+  unsigned getMaxNumAGPRs(const Function &F) const {
+    return getMaxNumVGPRs(F, false);
   }
 
   /// \returns Maximum number of VGPRs that meets number of waves per execution
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 25afa07bf90142..1c91ff57c1f3dd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1102,9 +1102,9 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
   return IsWave32 ? 8 : 4;
 }
 
-unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI, bool WholeRegisterFile) {
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
   if (STI->getFeatureBits().test(FeatureGFX90AInsts))
-    return WholeRegisterFile ? 512 : getAddressableNumArchVGPRs(STI);
+    return 512;
   if (!isGFX10Plus(*STI))
     return 256;
   bool IsWave32 = STI->getFeatureBits().test(FeatureWavefrontSize32);
@@ -1113,6 +1113,12 @@ unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI, bool WholeRegisterFile) {
   return IsWave32 ? 1024 : 512;
 }
 
+unsigned getTotalNumArchVGPRs(const MCSubtargetInfo *STI) {
+  if (STI->getFeatureBits().test(FeatureGFX90AInsts))
+    return 256;
+  return getTotalNumVGPRs(STI);
+}
+
 unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI) { return 256; }
 
 unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
@@ -1159,9 +1165,14 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
                         bool WholeRegisterFile) {
   assert(WavesPerEU != 0);
 
+  unsigned VGPRsInRegFile =
+      WholeRegisterFile
+          ? getTotalNumVGPRs(STI)
+          : std::min(getTotalNumVGPRs(STI), getTotalNumArchVGPRs(STI));
+
   unsigned MaxNumVGPRs =
-      alignDown(getTotalNumVGPRs(STI, WholeRegisterFile) / WavesPerEU,
-                getVGPRAllocGranule(STI));
+      alignDown(VGPRsInRegFile / WavesPerEU, getVGPRAllocGranule(STI));
+
   unsigned AddressableNumVGPRs = WholeRegisterFile
                                      ? getAddressableNumVGPRs(STI)
                                      : getAddressableNumArchVGPRs(STI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index e56df21faa0ca0..c367ec96daf029 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -293,8 +293,12 @@ unsigned getVGPREncodingGranule(
     std::optional<bool> EnableWavefrontSize32 = std::nullopt);
 
 /// \returns Total number of VGPRs for given subtarget \p STI.
-unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI,
-                          bool WholeRegisterFile = true);
+unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI);
+
+/// \returns Total number of Arch VGPRs for given subtarget \p STI. This is
+/// only different from getTotalNumVGPRs if the target has a unified register
+/// file.
+unsigned getTotalNumArchVGPRs(const MCSubtargetInfo *STI);
 
 /// \returns Addressable number of architectural VGPRs for a given subtarget \p
 /// STI.