[llvm] d09d834 - [AMDGPU] Fix GCNSubtarget::getMinNumVGPRs, add unit test to check consistency between GCNSubtarget's getMinNumVGPRs, getMaxNumVGPRs and getOccupancyWithNumVGPRs.

Tue Dec 6 00:15:09 PST 2022

Author: Valery Pykhtin
Date: 2022-12-06T09:14:49+01:00
New Revision: d09d834bb9807d7d484c499fe5128b43c8e5950d

URL: https://github.com/llvm/llvm-project/commit/d09d834bb9807d7d484c499fe5128b43c8e5950d
DIFF: https://github.com/llvm/llvm-project/commit/d09d834bb9807d7d484c499fe5128b43c8e5950d.diff

LOG: [AMDGPU] Fix GCNSubtarget::getMinNumVGPRs, add unit test to check consistency between GCNSubtarget's getMinNumVGPRs, getMaxNumVGPRs and getOccupancyWithNumVGPRs.

```
  /// \returns Minimum number of VGPRs that meets given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMinNumVGPRs(unsigned WavesPerEU) const;

  /// \returns Maximum number of VGPRs that meets given number of waves per
  /// execution unit requirement supported by the subtarget.
  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;

  /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
  /// VGPRs
  unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
```

While working on RP tracking issues I noticed that getMinNumVGPRs return incorrect
values: the problem is large VGPR granule sizes on GFX10+ architectures. Some of the
occupancies aren't reachable because require the same amount of VGPR granules as others.
For example 19 waves occupancy on gfx1010 require the same amount of granules as 20 waves
so the resultng occupancy would be 20.

SGPRs have the same issue and even have inconsistency between getMaxNumSGPRs and getOccupancyWithNumSGPRs.
It will be addressed in the next patch.

Legend:
  # MinVGPR and MaxVGPR are values returned by getMinNumVGPRs and getMaxNumVGPRs for a given Occ.
  # (ONumber) is the value returned by getOccupancyWithNumVGPRs for a given MinVGPR or MaxVGPR.
  # R means range problem: MinVGPR should be less than MaxVGPR and both should refer to the same occupancy.

Unit test output without the fix:
```
./build/unittests/Target/AMDGPU/AMDGPUTests --gtest_filter=AMDGPU.TestVGPRLimitsPerOccupancy --print-cpu-reg-limits

 gfx90a gfx940:
Occ    MinVGPR        MaxVGPR
  8        0 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     80  (O6)
  5       81 (O5)     96  (O5)
  4       97 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      257 (O1)     512 (O1)

 gfx600 gfx600 gfx601 gfx601 gfx601 gfx602 gfx602 gfx602 gfx700 gfx700 gfx701 gfx701 gfx702 gfx703 gfx703 gfx703 gfx704 gfx704 gfx705 gfx801 gfx801 gfx802 gfx802 gfx802 gfx803 gfx803 gfx803 gfx803 gfx805 gfx805 gfx810 gfx810 gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90c:
Occ    MinVGPR        MaxVGPR
 10        0 (O10)    24  (O10)
  9       25 (O9)     28  (O9)
  8       29 (O8)     32  (O8)
  7       33 (O7)     36  (O7)
  6       37 (O6)     40  (O6)
  5       41 (O5)     48  (O5)
  4       49 (O4)     64  (O4)
  3       65 (O3)     84  (O3)
  2       85 (O2)     128 (O2)
  1      129 (O1)     256 (O1)

 gfx1030w64 gfx1031w64 gfx1032w64 gfx1033w64 gfx1034w64 gfx1035w64 gfx1036w64 gfx1102w64 gfx1103w64:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    32  (O16)
 15       33 (O12) R  32  (O16)
 14       33 (O12) R  32  (O16)
 13       33 (O12) R  32  (O16)
 12       33 (O12)    40  (O12)
 11       41 (O10) R  40  (O12)
 10       41 (O10)    48  (O10)
  9       49 (O9)     56  (O9)
  8       57 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     80  (O6)
  5       81 (O5)     96  (O5)
  4       97 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      256 (O2) R   256 (O2)

 gfx1100w64 gfx1101w64:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    48  (O16)
 15       49 (O12) R  48  (O16)
 14       49 (O12) R  48  (O16)
 13       49 (O12) R  48  (O16)
 12       49 (O12)    60  (O12)
 11       61 (O10) R  60  (O12)
 10       61 (O10)    72  (O10)
  9       73 (O9)     84  (O9)
  8       85 (O8)     96  (O8)
  7       97 (O7)     108 (O7)
  6      109 (O6)     120 (O6)
  5      121 (O5)     144 (O5)
  4      145 (O4)     192 (O4)
  3      193 (O3)     252 (O3)
  2      253 (O2)     256 (O2)
  1      256 (O2) R   256 (O2)

 gfx1030w32 gfx1031w32 gfx1032w32 gfx1033w32 gfx1034w32 gfx1035w32 gfx1036w32 gfx1102w32 gfx1103w32:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    64  (O16)
 15       65 (O12) R  64  (O16)
 14       65 (O12) R  64  (O16)
 13       65 (O12) R  64  (O16)
 12       65 (O12)    80  (O12)
 11       81 (O10) R  80  (O12)
 10       81 (O10)    96  (O10)
  9       97 (O9)     112 (O9)
  8      113 (O8)     128 (O8)
  7      129 (O7)     144 (O7)
  6      145 (O6)     160 (O6)
  5      161 (O5)     192 (O5)
  4      193 (O4)     256 (O4)
  3      256 (O4) R   256 (O4)
  2      256 (O4) R   256 (O4)
  1      256 (O4) R   256 (O4)

 gfx1100w32 gfx1101w32:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    96  (O16)
 15       97 (O12) R  96  (O16)
 14       97 (O12) R  96  (O16)
 13       97 (O12) R  96  (O16)
 12       97 (O12)    120 (O12)
 11      121 (O10) R  120 (O12)
 10      121 (O10)    144 (O10)
  9      145 (O9)     168 (O9)
  8      169 (O8)     192 (O8)
  7      193 (O7)     216 (O7)
  6      217 (O6)     240 (O6)
  5      241 (O5)     256 (O5)
  4      256 (O5) R   256 (O5)
  3      256 (O5) R   256 (O5)
  2      256 (O5) R   256 (O5)
  1      256 (O5) R   256 (O5)

 gfx1010w64 gfx1011w64 gfx1012w64 gfx1013w64:
Occ    MinVGPR        MaxVGPR
 20        0 (O20)    24  (O20)
 19       25 (O18) R  24  (O20)
 18       25 (O18)    28  (O18)
 17       29 (O16) R  28  (O18)
 16       29 (O16)    32  (O16)
 15       33 (O14) R  32  (O16)
 14       33 (O14)    36  (O14)
 13       37 (O12) R  36  (O14)
 12       37 (O12)    40  (O12)
 11       41 (O11)    44  (O11)
 10       45 (O10)    48  (O10)
  9       49 (O9)     56  (O9)
  8       57 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     84  (O6)
  5       85 (O5)     100 (O5)
  4      101 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      256 (O2) R   256 (O2)

 gfx1010w32 gfx1011w32 gfx1012w32 gfx1013w32:
Occ    MinVGPR        MaxVGPR
 20        0 (O20)    48  (O20)
 19       49 (O18) R  48  (O20)
 18       49 (O18)    56  (O18)
 17       57 (O16) R  56  (O18)
 16       57 (O16)    64  (O16)
 15       65 (O14) R  64  (O16)
 14       65 (O14)    72  (O14)
 13       73 (O12) R  72  (O14)
 12       73 (O12)    80  (O12)
 11       81 (O11)    88  (O11)
 10       89 (O10)    96  (O10)
  9       97 (O9)     112 (O9)
  8      113 (O8)     128 (O8)
  7      129 (O7)     144 (O7)
  6      145 (O6)     168 (O6)
  5      169 (O5)     200 (O5)
  4      201 (O4)     256 (O4)
  3      256 (O4) R   256 (O4)
  2      256 (O4) R   256 (O4)
  1      256 (O4) R   256 (O4)
```

After the fix:
```
 gfx90a gfx940:
Occ    MinVGPR        MaxVGPR
  8        0 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     80  (O6)
  5       81 (O5)     96  (O5)
  4       97 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      257 (O1)     512 (O1)

 gfx600 gfx600 gfx601 gfx601 gfx601 gfx602 gfx602 gfx602 gfx700 gfx700 gfx701 gfx701 gfx702 gfx703 gfx703 gfx703 gfx704 gfx704 gfx705 gfx801 gfx801 gfx802 gfx802 gfx802 gfx803 gfx803 gfx803 gfx803 gfx805 gfx805 gfx810 gfx810 gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90c:
Occ    MinVGPR        MaxVGPR
 10        0 (O10)    24  (O10)
  9       25 (O9)     28  (O9)
  8       29 (O8)     32  (O8)
  7       33 (O7)     36  (O7)
  6       37 (O6)     40  (O6)
  5       41 (O5)     48  (O5)
  4       49 (O4)     64  (O4)
  3       65 (O3)     84  (O3)
  2       85 (O2)     128 (O2)
  1      129 (O1)     256 (O1)

 gfx1030w64 gfx1031w64 gfx1032w64 gfx1033w64 gfx1034w64 gfx1035w64 gfx1036w64 gfx1102w64 gfx1103w64:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    32  (O16)
 15        0 (O16)    32  (O16)
 14        0 (O16)    32  (O16)
 13        0 (O16)    32  (O16)
 12       33 (O12)    40  (O12)
 11       33 (O12)    40  (O12)
 10       41 (O10)    48  (O10)
  9       49 (O9)     56  (O9)
  8       57 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     80  (O6)
  5       81 (O5)     96  (O5)
  4       97 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      169 (O2)     256 (O2)

 gfx1100w64 gfx1101w64:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    48  (O16)
 15        0 (O16)    48  (O16)
 14        0 (O16)    48  (O16)
 13        0 (O16)    48  (O16)
 12       49 (O12)    60  (O12)
 11       49 (O12)    60  (O12)
 10       61 (O10)    72  (O10)
  9       73 (O9)     84  (O9)
  8       85 (O8)     96  (O8)
  7       97 (O7)     108 (O7)
  6      109 (O6)     120 (O6)
  5      121 (O5)     144 (O5)
  4      145 (O4)     192 (O4)
  3      193 (O3)     252 (O3)
  2      253 (O2)     256 (O2)
  1      253 (O2)     256 (O2)

 gfx1030w32 gfx1031w32 gfx1032w32 gfx1033w32 gfx1034w32 gfx1035w32 gfx1036w32 gfx1102w32 gfx1103w32:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    64  (O16)
 15        0 (O16)    64  (O16)
 14        0 (O16)    64  (O16)
 13        0 (O16)    64  (O16)
 12       65 (O12)    80  (O12)
 11       65 (O12)    80  (O12)
 10       81 (O10)    96  (O10)
  9       97 (O9)     112 (O9)
  8      113 (O8)     128 (O8)
  7      129 (O7)     144 (O7)
  6      145 (O6)     160 (O6)
  5      161 (O5)     192 (O5)
  4      193 (O4)     256 (O4)
  3      193 (O4)     256 (O4)
  2      193 (O4)     256 (O4)
  1      193 (O4)     256 (O4)

 gfx1100w32 gfx1101w32:
Occ    MinVGPR        MaxVGPR
 16        0 (O16)    96  (O16)
 15        0 (O16)    96  (O16)
 14        0 (O16)    96  (O16)
 13        0 (O16)    96  (O16)
 12       97 (O12)    120 (O12)
 11       97 (O12)    120 (O12)
 10      121 (O10)    144 (O10)
  9      145 (O9)     168 (O9)
  8      169 (O8)     192 (O8)
  7      193 (O7)     216 (O7)
  6      217 (O6)     240 (O6)
  5      241 (O5)     256 (O5)
  4      241 (O5)     256 (O5)
  3      241 (O5)     256 (O5)
  2      241 (O5)     256 (O5)
  1      241 (O5)     256 (O5)

 gfx1010w64 gfx1011w64 gfx1012w64 gfx1013w64:
Occ    MinVGPR        MaxVGPR
 20        0 (O20)    24  (O20)
 19        0 (O20)    24  (O20)
 18       25 (O18)    28  (O18)
 17       25 (O18)    28  (O18)
 16       29 (O16)    32  (O16)
 15       29 (O16)    32  (O16)
 14       33 (O14)    36  (O14)
 13       33 (O14)    36  (O14)
 12       37 (O12)    40  (O12)
 11       41 (O11)    44  (O11)
 10       45 (O10)    48  (O10)
  9       49 (O9)     56  (O9)
  8       57 (O8)     64  (O8)
  7       65 (O7)     72  (O7)
  6       73 (O6)     84  (O6)
  5       85 (O5)     100 (O5)
  4      101 (O4)     128 (O4)
  3      129 (O3)     168 (O3)
  2      169 (O2)     256 (O2)
  1      169 (O2)     256 (O2)

 gfx1010w32 gfx1011w32 gfx1012w32 gfx1013w32:
Occ    MinVGPR        MaxVGPR
 20        0 (O20)    48  (O20)
 19        0 (O20)    48  (O20)
 18       49 (O18)    56  (O18)
 17       49 (O18)    56  (O18)
 16       57 (O16)    64  (O16)
 15       57 (O16)    64  (O16)
 14       65 (O14)    72  (O14)
 13       65 (O14)    72  (O14)
 12       73 (O12)    80  (O12)
 11       81 (O11)    88  (O11)
 10       89 (O10)    96  (O10)
  9       97 (O9)     112 (O9)
  8      113 (O8)     128 (O8)
  7      129 (O7)     144 (O7)
  6      145 (O6)     168 (O6)
  5      169 (O5)     200 (O5)
  4      201 (O4)     256 (O4)
  3      201 (O4)     256 (O4)
  2      201 (O4)     256 (O4)
  1      201 (O4)     256 (O4)
```

Reviewed By: #amdgpu, arsenm

Differential Revision: https://reviews.llvm.org/D138443

Added: 
    llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
    llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
    llvm/unittests/Target/AMDGPU/CMakeLists.txt
    llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
    llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index b7d19501ee7b5..8b5d0b88b0a55 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -625,13 +625,8 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   return 5;
 }
 
-unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
-  unsigned MaxWaves = getMaxWavesPerEU();
-  unsigned Granule = getVGPRAllocGranule();
-  if (VGPRs < Granule)
-    return MaxWaves;
-  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
-  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
+unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
+  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
 }
 
 unsigned

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 731e203be1f07..1d035a2e3da52 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1218,14 +1218,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
   }
 
-  /// \returns Minimum number of VGPRs that meets given number of waves per
-  /// execution unit requirement supported by the subtarget.
+  /// \returns the minimum number of VGPRs that will prevent achieving more than
+  /// the specified number of waves \p WavesPerEU.
   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
   }
 
-  /// \returns Maximum number of VGPRs that meets given number of waves per
-  /// execution unit requirement supported by the subtarget.
+  /// \returns the maximum number of VGPRs that can be used and still achieved
+  /// at least the specified number of waves \p WavesPerEU.
   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
   }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9616464d5414f..06074e995e7ca 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1018,15 +1018,38 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
   return 256;
 }
 
+unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
+                                      unsigned NumVGPRs) {
+  unsigned MaxWaves = getMaxWavesPerEU(STI);
+  unsigned Granule = getVGPRAllocGranule(STI);
+  if (NumVGPRs < Granule)
+    return MaxWaves;
+  unsigned RoundedRegs = alignTo(NumVGPRs, Granule);
+  return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves);
+}
+
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
   assert(WavesPerEU != 0);
 
-  if (WavesPerEU >= getMaxWavesPerEU(STI))
+  unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
+  if (WavesPerEU >= MaxWavesPerEU)
+    return 0;
+
+  unsigned TotNumVGPRs = getTotalNumVGPRs(STI);
+  unsigned AddrsableNumVGPRs = getAddressableNumVGPRs(STI);
+  unsigned Granule = getVGPRAllocGranule(STI);
+  unsigned MaxNumVGPRs = alignDown(TotNumVGPRs / WavesPerEU, Granule);
+
+  if (MaxNumVGPRs == alignDown(TotNumVGPRs / MaxWavesPerEU, Granule))
     return 0;
-  unsigned MinNumVGPRs =
-      alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
-                getVGPRAllocGranule(STI)) + 1;
-  return std::min(MinNumVGPRs, getAddressableNumVGPRs(STI));
+
+  unsigned MinWavesPerEU = getNumWavesPerEUWithNumVGPRs(STI, AddrsableNumVGPRs);
+  if (WavesPerEU < MinWavesPerEU)
+    return getMinNumVGPRs(STI, MinWavesPerEU);
+
+  unsigned MaxNumVGPRsNext = alignDown(TotNumVGPRs / (WavesPerEU + 1), Granule);
+  unsigned MinNumVGPRs = 1 + std::min(MaxNumVGPRs - Granule, MaxNumVGPRsNext);
+  return std::min(MinNumVGPRs, AddrsableNumVGPRs);
 }
 
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 82704dd55d9a5..3af2fb02120ea 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -288,6 +288,11 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 /// execution unit requirement for given subtarget \p STI.
 unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 
+/// \returns Number of waves reachable for a given \p NumVGPRs usage for given
+/// subtarget \p STI.
+unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
+                                      unsigned NumVGPRs);
+
 /// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used.
 ///

diff  --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
index 6ef70f45bd503..600118386fe32 100644
--- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
+++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll
@@ -40,7 +40,7 @@ define amdgpu_kernel void @limited_occupancy_18() #1 {
 
 ; GCN-LABEL: {{^}}limited_occupancy_19:
 ; GFX9:       ; Occupancy: 10
-; GFX1010:    ; Occupancy: 18
+; GFX1010:    ; Occupancy: 20
 ; GFX1030:    ; Occupancy: 16
 ; GFX1100:    ; Occupancy: 16
 define amdgpu_kernel void @limited_occupancy_19() #2 {

diff  --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
new file mode 100644
index 0000000000000..13415c1f68ae0
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -0,0 +1,157 @@
+//===--------- llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUUnitTests.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/TargetSelect.h"
+#include "gtest/gtest.h"
+
+#include "AMDGPUGenSubtargetInfo.inc"
+
+using namespace llvm;
+
+std::once_flag flag;
+
+void InitializeAMDGPUTarget() {
+  std::call_once(flag, []() {
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetMC();
+  });
+}
+
+std::unique_ptr<const GCNTargetMachine>
+llvm::createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS) {
+  InitializeAMDGPUTarget();
+
+  std::string Error;
+  const Target *T = TargetRegistry::lookupTarget(TStr, Error);
+  if (!T)
+    return nullptr;
+
+  TargetOptions Options;
+  return std::unique_ptr<GCNTargetMachine>(
+      static_cast<GCNTargetMachine *>(T->createTargetMachine(
+          TStr, CPU, FS, Options, std::nullopt, std::nullopt)));
+}
+
+static cl::opt<bool> PrintCpuRegLimits(
+    "print-cpu-reg-limits", cl::NotHidden, cl::init(false),
+    cl::desc("force printing per AMDGPU CPU register limits"));
+
+static bool checkMinMax(std::stringstream &OS, unsigned Occ, unsigned MinOcc,
+                        unsigned MaxOcc,
+                        std::function<unsigned(unsigned)> GetOcc,
+                        std::function<unsigned(unsigned)> GetMinGPRs,
+                        std::function<unsigned(unsigned)> GetMaxGPRs) {
+  bool MinValid = true, MaxValid = true, RangeValid = true;
+  unsigned MinGPRs = GetMinGPRs(Occ);
+  unsigned MaxGPRs = GetMaxGPRs(Occ);
+  unsigned RealOcc;
+
+  if (MinGPRs >= MaxGPRs)
+    RangeValid = false;
+  else {
+    RealOcc = GetOcc(MinGPRs);
+    for (unsigned NumRegs = MinGPRs + 1; NumRegs <= MaxGPRs; ++NumRegs) {
+      if (RealOcc != GetOcc(NumRegs)) {
+        RangeValid = false;
+        break;
+      }
+    }
+  }
+
+  if (RangeValid && RealOcc > MinOcc && RealOcc <= MaxOcc) {
+    if (MinGPRs > 0 && GetOcc(MinGPRs - 1) <= RealOcc)
+      MinValid = false;
+
+    if (GetOcc(MaxGPRs + 1) >= RealOcc)
+      MaxValid = false;
+  }
+
+  std::stringstream MinStr;
+  MinStr << (MinValid ? ' ' : '<') << ' ' << std::setw(3) << MinGPRs << " (O"
+         << GetOcc(MinGPRs) << ") " << (RangeValid ? ' ' : 'R');
+
+  OS << std::left << std::setw(15) << MinStr.str() << std::setw(3) << MaxGPRs
+     << " (O" << GetOcc(MaxGPRs) << ')' << (MaxValid ? "" : " >");
+
+  return MinValid && MaxValid && RangeValid;
+}
+
+static const std::pair<StringRef, StringRef>
+  EmptyFS = {"", ""},
+  W32FS = {"+wavefrontsize32", "w32"},
+  W64FS = {"+wavefrontsize64", "w64"};
+
+static void testGPRLimits(
+    const char *RegName, bool TestW32W64,
+    std::function<bool(std::stringstream &, unsigned, GCNSubtarget &)> test) {
+  SmallVector<StringRef> CPUs;
+  AMDGPU::fillValidArchListAMDGCN(CPUs);
+
+  std::map<std::string, SmallVector<std::string>> TablePerCPUs;
+  for (auto CPUName : CPUs) {
+    auto CanonCPUName =
+        AMDGPU::getArchNameAMDGCN(AMDGPU::parseArchAMDGCN(CPUName));
+
+    auto *FS = &EmptyFS;
+    while (true) {
+      auto TM = createAMDGPUTargetMachine("amdgcn-amd-", CPUName, FS->first);
+      if (!TM)
+        break;
+
+      GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
+                      std::string(TM->getTargetFeatureString()), *TM);
+
+      if (TestW32W64 &&
+          ST.getFeatureBits().test(AMDGPU::FeatureWavefrontSize32))
+        FS = &W32FS;
+
+      std::stringstream Table;
+      bool Success = true;
+      unsigned MaxOcc = ST.getMaxWavesPerEU();
+      for (unsigned Occ = MaxOcc; Occ > 0; --Occ) {
+        Table << std::right << std::setw(3) << Occ << "    ";
+        Success = test(Table, Occ, ST) && Success;
+        Table << '\n';
+      }
+      if (!Success || PrintCpuRegLimits)
+        TablePerCPUs[Table.str()].push_back((CanonCPUName + FS->second).str());
+
+      if (FS != &W32FS)
+        break;
+
+      FS = &W64FS;
+    }
+  }
+  std::stringstream OS;
+  for (auto &P : TablePerCPUs) {
+    for (auto &CPUName : P.second)
+      OS << ' ' << CPUName;
+    OS << ":\nOcc    Min" << RegName << "        Max" << RegName << '\n'
+       << P.first << '\n';
+  }
+  auto ErrStr = OS.str();
+  EXPECT_TRUE(ErrStr.empty()) << ErrStr;
+}
+
+TEST(AMDGPU, TestVGPRLimitsPerOccupancy) {
+  testGPRLimits("VGPR", true, [](std::stringstream &OS, unsigned Occ,
+                                 GCNSubtarget &ST) {
+    unsigned MaxVGPRNum = ST.getAddressableNumVGPRs();
+    return checkMinMax(
+        OS, Occ, ST.getOccupancyWithNumVGPRs(MaxVGPRNum), ST.getMaxWavesPerEU(),
+        [&](unsigned NumGPRs) { return ST.getOccupancyWithNumVGPRs(NumGPRs); },
+        [&](unsigned Occ) { return ST.getMinNumVGPRs(Occ); },
+        [&](unsigned Occ) { return ST.getMaxNumVGPRs(Occ); });
+  });
+}

diff  --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h
new file mode 100644
index 0000000000000..ccd2e32527042
--- /dev/null
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h
@@ -0,0 +1,25 @@
+//===---------- llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.h ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H
+#define LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H
+
+#include <memory>
+#include <string>
+
+namespace llvm {
+
+class GCNTargetMachine;
+class StringRef;
+
+std::unique_ptr<const GCNTargetMachine>
+createAMDGPUTargetMachine(std::string TStr, StringRef CPU, StringRef FS);
+
+} // end namespace llvm
+
+#endif // LLVM_UNITTESTS_TARGET_AMDGPU_AMDGPUUNITTESTS_H

diff  --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
index a5a57ffa82b0c..12211290ef850 100644
--- a/llvm/unittests/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS
   AMDGPUCodeGen
   AMDGPUDesc
   AMDGPUInfo
+  AMDGPUUtils
   CodeGen
   Core
   MC
@@ -14,6 +15,7 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_target_unittest(AMDGPUTests
+  AMDGPUUnitTests.cpp
   DwarfRegMappings.cpp
   ExecMayBeModifiedBeforeAnyUse.cpp
   )

diff  --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
index e4d712e14364c..620835c5dfc5c 100644
--- a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
+++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
@@ -6,47 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
+#include "AMDGPUUnitTests.h"
 #include "gtest/gtest.h"
-#include <thread>
 
 using namespace llvm;
 
-std::once_flag flag;
-
-void InitializeAMDGPUTarget() {
-  std::call_once(flag, []() {
-    LLVMInitializeAMDGPUTargetInfo();
-    LLVMInitializeAMDGPUTarget();
-    LLVMInitializeAMDGPUTargetMC();
-  });
-}
-
-std::unique_ptr<const GCNTargetMachine>
-createTargetMachine(std::string TStr, StringRef CPU, StringRef FS) {
-  InitializeAMDGPUTarget();
-
-  std::string Error;
-  const Target *T = TargetRegistry::lookupTarget(TStr, Error);
-  if (!T)
-    return nullptr;
-
-  TargetOptions Options;
-  return std::unique_ptr<GCNTargetMachine>(
-      static_cast<GCNTargetMachine *>(T->createTargetMachine(
-          TStr, CPU, FS, Options, std::nullopt, std::nullopt)));
-}
-
-TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) {
+TEST(AMDGPU, TestWave64DwarfRegMapping) {
   for (auto Triple :
        {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) {
-    auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize64");
+    auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize64");
     if (TM) {
       GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
                       std::string(TM->getTargetFeatureString()), *TM);
@@ -66,10 +35,10 @@ TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) {
   }
 }
 
-TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) {
+TEST(AMDGPU, TestWave32DwarfRegMapping) {
   for (auto Triple :
        {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) {
-    auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize32");
+    auto TM = createAMDGPUTargetMachine(Triple, "gfx1010", "+wavefrontsize32");
     if (TM) {
       GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()),
                       std::string(TM->getTargetFeatureString()), *TM);

diff  --git a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
index a8f7c2f8151bb..5e926a4fa1639 100755
--- a/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
+++ b/llvm/unittests/Target/AMDGPU/ExecMayBeModifiedBeforeAnyUse.cpp
@@ -7,25 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUTargetMachine.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "AMDGPUUnitTests.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/MC/MCTargetOptions.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
 #include "gtest/gtest.h"
-#include <thread>
 
 using namespace llvm;
 
-// implementation is in the llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp
-std::unique_ptr<const GCNTargetMachine>
-createTargetMachine(std::string TStr, StringRef CPU, StringRef FS);
-
-TEST(AMDGPUExecMayBeModifiedBeforeAnyUse, TheTest) {
-  auto TM = createTargetMachine("amdgcn-amd-", "gfx906", "");
+TEST(AMDGPU, ExecMayBeModifiedBeforeAnyUse) {
+  auto TM = createAMDGPUTargetMachine("amdgcn-amd-", "gfx906", "");
   if (!TM)
     return;