[llvm] [AMDGPU] Ignore inactive VGPRs in .vgpr_count (PR #149052)

Fri Jul 18 05:20:07 PDT 2025

https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/149052

>From 1ea0b5b7b20f1597f46b51e93f1f6e9eb48a4f79 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Wed, 16 Jul 2025 09:34:09 +0200
Subject: [PATCH 1/3] [AMDGPU] Use SIRegisterInfo to compute used registers.
 NFCI

Simplify the code in AMDGPUResourceUsageAnalysis to rely more
on the TargetRegisterInfo for computing the number of used SGPRs and
AGPRs. This is a preliminary refactoring split out from #144855.

(While we could technically use TRI to compute the used number of VGPRs
 at this point too, I'm leaving some of the original code in since for
 VGPRs we're going to introduce some special cases).
---
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 234 ++----------------
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   8 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   6 +-
 3 files changed, 31 insertions(+), 217 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 9a609a1752de0..ec4daa2cf662a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -139,77 +139,39 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
 
   Info.UsesVCC =
       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+  Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass,
+                                                /*IncludeCalls=*/false);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
+                                          /*IncludeCalls=*/false);
 
   // If there are no calls, MachineRegisterInfo can tell us the used register
   // count easily.
   // A tail call isn't considered a call for MachineFrameInfo's purposes.
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-    if (ST.hasMAIInsts())
-      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
+                                          /*IncludeCalls=*/false);
     return Info;
   }
 
   int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
+      for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+        const MachineOperand &MO = MI.getOperand(I);
 
         if (!MO.isReg())
           continue;
 
         Register Reg = MO.getReg();
         switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE_LO:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT_LO:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE_LO:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::SGPR_NULL64:
-        case AMDGPU::MODE:
-          continue;
-
         case AMDGPU::NoRegister:
           assert(MI.isDebugInstr() &&
                  "Instruction uses invalid noreg register");
           continue;
 
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
         case AMDGPU::XNACK_MASK:
         case AMDGPU::XNACK_MASK_LO:
         case AMDGPU::XNACK_MASK_HI:
@@ -239,170 +201,22 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
           break;
         }
 
-        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
-            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 9;
-        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 9;
-        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 9;
-        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 10;
-        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 10;
-        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 10;
-        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 11;
-        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 11;
-        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 11;
-        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 12;
-        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 12;
-        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 12;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          // We only expect TTMP registers or registers that do not belong to
-          // any RC.
-          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
-                  !TRI.getPhysRegBaseClass(Reg)) &&
-                 "Unknown register class");
-        }
+        const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg);
+        assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) ||
+                TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) ||
+                AMDGPU::TTMP_64RegClass.contains(Reg) ||
+                AMDGPU::TTMP_128RegClass.contains(Reg) ||
+                AMDGPU::TTMP_256RegClass.contains(Reg) ||
+                AMDGPU::TTMP_512RegClass.contains(Reg)) &&
+               "Unknown register class");
+
+        if (!RC || !TRI.isVGPRClass(RC))
+          continue;
+
+        unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
         unsigned HWReg = TRI.getHWRegIndex(Reg);
         int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
+        MaxVGPR = std::max(MaxUsed, MaxVGPR);
       }
 
       if (MI.isCall()) {
@@ -464,9 +278,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
-  Info.NumExplicitSGPR = MaxSGPR + 1;
   Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
 
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8c3873d23419f..03cd5ea6f2bf8 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4050,11 +4050,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
   return 0;
 }
 
-unsigned
-SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
-                                   const TargetRegisterClass &RC) const {
+unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+                                            const TargetRegisterClass &RC,
+                                            bool IncludeCalls) const {
   for (MCPhysReg Reg : reverse(RC.getRegisters()))
-    if (MRI.isPhysRegUsed(Reg))
+    if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
       return getHWRegIndex(Reg) + 1;
   return 0;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a4b135d5e0b59..f3ce80ac2864e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -482,9 +482,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
                                      unsigned SubReg) const;
 
   // \returns a number of registers of a given \p RC used in a function.
-  // Does not go inside function calls.
+  // Does not go inside function calls. If \p IncludeCalls is true, it will
+  // include registers that may be clobbered by calls.
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
-                              const TargetRegisterClass &RC) const;
+                              const TargetRegisterClass &RC,
+                              bool IncludeCalls = true) const;
 
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG

>From 3ccfa1e5284bcafe6f927d89096d26619c23bec1 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Wed, 16 Jul 2025 09:45:50 +0200
Subject: [PATCH 2/3] [AMDGPU] Ignore inactive VGPRs in .vgpr_count

When using the `amdgcn.init.whole.wave` intrinsic, we add dummy VGPR
arguments with the purpose of preserving their inactive lanes. The
pattern may look something like this:

```
entry:
  call amdgcn.init.whole.wave
  branch to shader or tail

shader:
  $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes
  actual code...

tail:
  call amdgcn.cs.chain [...], implicit $vInactive
```

We should not report these VGPRs in the `.vgpr_count` metadata. This patch
achieves that goal by ignoring IMPLICIT_DEFs and calls. This should be
safe since if those registers are actually used in any other context,
they will be counted there.

It also reduces the scope of the code that counts unused function
arguments to only work on entry functions, since only they need to
handle hardware-initialized registers.

This is a reworked version of #133242, which was reverted in #144039.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  2 +-
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    |  3 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 22 ++++++
 .../init-whole-wave-vgpr-count-large.ll       | 76 ++++++++++++++++++
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll | 50 ++++++++++++
 ...init-whole-wave-vgpr-count-use-inactive.ll | 78 +++++++++++++++++++
 .../AMDGPU/init-whole-wave-vgpr-count.ll      | 75 ++++++++++++++++++
 .../AMDGPU/unnamed-function-resource-info.ll  |  2 +-
 .../CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll  |  4 +-
 .../test/CodeGen/AMDGPU/vgpr-count-compute.ll | 30 +++++++
 .../AMDGPU/vgpr-count-graphics-chain.ll       | 27 +++++++
 .../CodeGen/AMDGPU/vgpr-count-graphics.ll     | 25 ++++++
 12 files changed, 390 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c0920e3e71bee..14c392b2b2250 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -993,7 +993,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv())) {
+  if (AMDGPU::shouldReportUnusedFuncArgs(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ec4daa2cf662a..bb0d2027c71f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -213,6 +213,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
         if (!RC || !TRI.isVGPRClass(RC))
           continue;
 
+        if (MI.isCall() || MI.isImplicitDef())
+          continue;
+
         unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32);
         unsigned HWReg = TRI.getHWRegIndex(Reg);
         int MaxUsed = HWReg + Width - 1;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index e6078d6918ac2..bc06c68d968c6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1367,6 +1367,28 @@ constexpr bool isEntryFunctionCC(CallingConv::ID CC) {
   }
 }
 
+// Shaders that are entry functions need to count input arguments even if
+// they're not used (i.e. not reported by AMDGPUResourceUsageAnalysis). Other
+// functions can skip including them. This is especially important for shaders
+// that use the init.whole.wave intrinsic, since they sometimes have VGPR
+// arguments that are only added for the purpose of preserving their inactive
+// lanes and should not be included in the vgpr-count.
+LLVM_READNONE
+constexpr bool shouldReportUnusedFuncArgs(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_LS:
+  case CallingConv::AMDGPU_HS:
+  case CallingConv::AMDGPU_ES:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+    return true;
+  default:
+    return false;
+  }
+}
+
 LLVM_READNONE
 constexpr bool isChainCC(CallingConv::ID CC) {
   switch (CC) {
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
new file mode 100644
index 0000000000000..e47f5e25ead3a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -0,0 +1,76 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Use VGPRs above the input arguments.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; clobber v28", "~{v28}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
new file mode 100644
index 0000000000000..5d7472fd3c56e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: leaf_shader:
+; CHECK: .vgpr_count:{{.*}}0xc{{$}}
+
+; Function without calls.
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
+                              i32 %active.vgpr1, i32 %active.vgpr2,
+                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
+                              local_unnamed_addr {
+entry:
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %compute, label %merge
+
+compute:
+  ; Perform a more complex computation using active VGPRs
+  %square = mul i32 %active.vgpr1, %active.vgpr1
+  %product = mul i32 %square, %active.vgpr2
+  %sum = add i32 %product, %input.value
+  %result = add i32 %sum, 42
+  br label %merge
+
+merge:
+  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
+
+  store i32 %final.result, ptr %output.ptr, align 4
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
new file mode 100644
index 0000000000000..f1f7fb22d44c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -0,0 +1,78 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
+; In that case, the VGPR should be included in the .vgpr_count
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xd{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
new file mode 100644
index 0000000000000..b9130dd1b7ed4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xa{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index cf5b95a729974..bb0ec0d3ad3f8 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -16,7 +16,7 @@ entry:
 }
 
 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 2cb5e309c8c21..ee35dc4cddade 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }
 
 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 32
+; GFX90A: NumVgprs: 1
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 33
+; GFX90A: TotalNumVgprs: 5
 define void @f1024_0() #1024 {
   call void @foo()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
new file mode 100644
index 0000000000000..8c8182db7b479
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
+; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
+target triple = "amdgcn-amd-amdhsa"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
+; Only hardware-initialized VGPRs (v0) are read in this kernel.
+
+; CHECK-LABEL: amdhsa.kernels:
+; CHECK-LABEL: kernel_x
+; CHECK: .vgpr_count:     1
+define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; CHECK-LABEL: kernel_z
+; PACKED: .vgpr_count:     1
+; NOTPACKED: .vgpr_count:     3
+define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.z()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
new file mode 100644
index 0000000000000..9b8bd079958df
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
+target triple = "amdgcn--amdpal"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Shouldn't report the part of %vgpr_args that's not used
+; CHECK-LABEL: entry_point_symbol: cs_calling_chain
+; CHECK: .vgpr_count:     0xa
+define amdgpu_cs void @cs_calling_chain(i32 %vgpr, i32 inreg %sgpr) {
+  %vgpr_args = insertvalue {i32, i32, i32, i32} poison, i32 %vgpr, 1
+  call void (ptr, i32, i32, {i32, i32, i32, i32}, i32, ...) @llvm.amdgcn.cs.chain.p0.i32.i32.s(
+    ptr @chain_func, i32 0, i32 inreg %sgpr, {i32, i32, i32, i32} %vgpr_args, i32 0)
+  unreachable
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: chain_func:
+; CHECK: .vgpr_count:     0x1
+define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
+entry:
+  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
+        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
+  unreachable
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
new file mode 100644
index 0000000000000..04c3005541fe1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
+target triple = "amdgcn--amdpal"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
+; CHECK-LABEL: .entry_point_symbol: kernel_use
+; CHECK: .vgpr_count:     0x20
+define amdgpu_cs void @kernel_use([32 x i32] %args) {
+entry:
+  %a = extractvalue [32 x i32] %args, 14
+  store i32 %a, ptr addrspace(1) @global
+  ret void
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: gfx_func:
+; CHECK: .vgpr_count:     0x20
+define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
+entry:
+  ret [32 x i32] %args
+}

>From d790c7d2ffc82d2199c9830e70cdecf7f73c2378 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Fri, 18 Jul 2025 14:15:16 +0200
Subject: [PATCH 3/3] Fixup RUN lines

---
 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll        | 4 ++--
 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll | 2 +-
 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
index 8c8182db7b479..71d4e6b277320 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
-; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED
+; RUN: llc -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
 target triple = "amdgcn-amd-amdhsa"
 
 @global = addrspace(1) global i32 poison, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
index 9b8bd079958df..3cf540c114160 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics-chain.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s
 ; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
 target triple = "amdgcn--amdpal"
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
index 04c3005541fe1..de7d3a13a2a7e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; RUN: llc -mcpu=gfx1200 < %s | FileCheck %s
 ; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
 target triple = "amdgcn--amdpal"