[llvm] b761137 - [AMDGPU] Use correct VGPR threshold for flagging ExcessRP regions in unified register file case (#85860)

Mon Mar 25 13:12:03 PDT 2024

Author: Jeffrey Byrnes
Date: 2024-03-25T13:11:58-07:00
New Revision: b7611370491873722e08e4ce9374312d0c936af1

URL: https://github.com/llvm/llvm-project/commit/b7611370491873722e08e4ce9374312d0c936af1
DIFF: https://github.com/llvm/llvm-project/commit/b7611370491873722e08e4ce9374312d0c936af1.diff

LOG: [AMDGPU] Use correct VGPR threshold for flagging ExcessRP regions in unified register file case (#85860)

`ST.getMaxNumVGPRs(MF)` lowers to `AMDGPUBaseInfo.cpp:getTotalNumVGPRs`
which returns 512 for gfx90a. This is subsequently limited by
`AMDGPUBaseInfo:getAddressableNumVGPRs()`, which also returns 512 for
gfx90a. The ISA states we can have a total of 512 registers, but a
maximum of only 256 of each of AGPR and VGPR (gfx90a 3.6.4).

Therefore, in unified register file case, `ST.getMaxNumVGPRs(MF)`
calculates the maximum number of combined VGPR + AGPR. But, it is
currently used as the limit for accvgpr and as the limit for archvgpr.

This patch uses it as the combined limit, and accounts for the maximum addressable arch/acc VGPRs when calculating the per RegClass limits.

It is not unreasonable to think other clients of getTotalNumVGPRs are
using it in the wrong way.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 9f419a7fbf6834..94d93390d0916f 100644

--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -973,12 +973,17 @@ void GCNSchedStage::checkScheduling() {
     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                       << DAG.MinOccupancy << ".\n");
   }
-
+  // The maximum number of arch VGPR on non-unified register file, or the
+  // maximum VGPR + AGPR in the unified register file case.
   unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+  // The maximum number of arch VGPR for both unified and non-unified register
+  // file.
+  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(false) > MaxVGPRs ||
-      PressureAfter.getAGPRNum() > MaxVGPRs ||
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
+      PressureAfter.getVGPRNum(false) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RescheduleRegions[RegionIdx] = true;
     DAG.RegionsWithHighRP[RegionIdx] = true;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
index 091b29c23d60e2..e93595b9ef2735 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@@ -4,6 +4,8 @@
 --- |
   define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }
 
+  attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
+
   !0 = distinct !{!0}
   !1 = !{!1, !0}
 ...