[llvm] 130080f - [AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#133242)

Tue Jun 3 02:20:52 PDT 2025

Author: Diana Picus
Date: 2025-06-03T11:20:48+02:00
New Revision: 130080fab11cde5efcb338b77f5c3b31097df6e6

URL: https://github.com/llvm/llvm-project/commit/130080fab11cde5efcb338b77f5c3b31097df6e6
DIFF: https://github.com/llvm/llvm-project/commit/130080fab11cde5efcb338b77f5c3b31097df6e6.diff

LOG: [AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (#133242)

Don't count register uses when determining the maximum number of
registers used by a function. Count only the defs. This is really an
underestimate of the true register usage, but in practice that's not
a problem because if a function uses a register, then it has either
defined it earlier, or some other function that executed before has
defined it.

In particular, the register counts are used:
1. When launching an entry function - in which case we're safe because
   the register counts of the entry function will include the register
   counts of all callees.
2. At function boundaries in dynamic VGPR mode. In this case it's safe
   because whenever we set the new VGPR allocation we take into account
   the outgoing_vgpr_count set by the middle-end.

The main advantage of doing this is that the artificial VGPR arguments
used only for preserving the inactive lanes when using the
llvm.amdgcn.init.whole.wave intrinsic are no longer counted. This
enables us to allocate only the registers we need in dynamic VGPR mode.

---------

Co-authored-by: Thomas Symalla <5754458+tsymalla at users.noreply.github.com>

Added: 
    llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
    llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
    llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
    llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
    llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
    llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
    llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
    llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
    llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
    llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
    llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
    llvm/test/CodeGen/AMDGPU/code-object-v3.ll
    llvm/test/CodeGen/AMDGPU/elf-notes.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
    llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
    llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
    llvm/test/CodeGen/AMDGPU/hsa.ll
    llvm/test/CodeGen/AMDGPU/ipra.ll
    llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
    llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
    llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
    llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
    llvm/test/CodeGen/AMDGPU/register-count-comments.ll
    llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
    llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
    llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
    llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
    llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
    llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
    llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
    llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 9798e5437be5e..174a497c51b26 100644

--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4263,10 +4263,9 @@ same *vendor-name*.
                                                                   wavefront for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  used explicitly, or
+                                                                  written to, or
                                                                   if a higher numbered
-                                                                  register is used
-                                                                  explicitly. This
+                                                                  register is written to. This
                                                                   includes the special
                                                                   SGPRs for VCC, Flat
                                                                   Scratch (GFX7-GFX9)
@@ -4284,10 +4283,10 @@ same *vendor-name*.
                                                                   each work-item for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  used explicitly, or
+                                                                  written to, or
                                                                   if a higher numbered
-                                                                  register is used
-                                                                  explicitly.
+                                                                  register is
+                                                                  written to.
      ".agpr_count"                       integer        Required  Number of accumulator
                                                                   registers required by
                                                                   each work-item for

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 491314daf2d81..d4fea30f21f45 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -989,7 +989,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv())) {
+  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
@@ -1060,15 +1060,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
-  } else if (isKernel(F.getCallingConv()) &&
-             MFI->getNumKernargPreloadedSGPRs()) {
-    // Consider cases where the total number of UserSGPRs with trailing
-    // allocated preload SGPRs, is greater than the number of explicitly
-    // referenced SGPRs.
-    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
-        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
-    ProgInfo.NumSGPR =
-        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 9a609a1752de0..7bde59412d905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,274 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
-  Info.UsesVCC =
-      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
-  // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-    if (ST.hasMAIInsts())
-      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
-    return Info;
+  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
+
+  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+  Info.NumExplicitSGPR =
+      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+
+  // Preloaded registers are written by the hardware, not defined in the
+  // function body, so they need special handling.
+  if (MFI->isEntryFunction()) {
+    Info.NumExplicitSGPR =
+        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
+    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
   }
 
-  int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
+    return Info;
+
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE_LO:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT_LO:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE_LO:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::SGPR_NULL64:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr() &&
-                 "Instruction uses invalid noreg register");
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
-            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 9;
-        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 9;
-        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 9;
-        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 10;
-        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 10;
-        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 10;
-        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 11;
-        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 11;
-        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 11;
-        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 12;
-        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 12;
-        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 12;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          // We only expect TTMP registers or registers that do not belong to
-          // any RC.
-          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
-                  !TRI.getPhysRegBaseClass(Reg)) &&
-                 "Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
-      }
-
       if (MI.isCall()) {
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
@@ -464,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
-
   return Info;
 }

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0e7635a045588..01718faaf5c2e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -970,10 +970,25 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return NumUserSGPRs;
   }
 
+  // Get the number of preloaded SGPRs for compute kernels.
   unsigned getNumPreloadedSGPRs() const {
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  // Get the number of preloaded VGPRs for compute kernels.
+  unsigned getNumPreloadedVGPRs() const {
+    if (hasWorkItemIDZ())
+      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDY())
+      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDX())
+      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
+
+    return 0;
+  }
+
   unsigned getNumKernargPreloadedSGPRs() const {
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e41189adfb46f..511ea4125c8ec 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4055,6 +4055,20 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
   return 0;
 }
 
+unsigned
+SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
+                                      const TargetRegisterClass &RC) const {
+  for (MCPhysReg Reg : reverse(RC.getRegisters())) {
+    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
+      if (std::any_of(
+              MRI.def_instr_begin(*AI), MRI.def_instr_end(),
+              [](const MachineInstr &MI) { return !MI.isImplicitDef(); }))
+        return getHWRegIndex(Reg) + 1;
+    }
+  }
+  return 0;
+}
+
 SmallVector<StringLiteral>
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                   const MachineFunction &MF) const {

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index a4b135d5e0b59..7726762ad0e6d 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -486,6 +486,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass &RC) const;
 
+  // \returns the number of registers of a given \p RC defined in a function.
+  // Does not go inside function calls.
+  unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
+                                 const TargetRegisterClass &RC) const;
+
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
                              : std::optional<uint8_t>{};

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 9b35920f8547a..bdd86c1af6248 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 17
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     wavefront_sgpr_count = 18
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 16
-; GPRIDX-NEXT:     workitem_vgpr_count = 2
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
-; GFX10-NEXT:     workitem_vgpr_count = 2
+; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 6
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 2
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 16
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     wavefront_sgpr_count = 18
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index 7bf9a29e9ff44..cc614bb24839c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,8 +13,9 @@
 ; CHECK: {{^}}kernel_illegal_agpr_use_asm:
 ; CHECK: ; use a0
 
-; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 1
+; GFX908: NumVgprs: 3
+; GFX90A: NumVgprs: 1
+; CHECK: NumAgprs: 0
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
@@ -24,7 +25,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
 ; CHECK: ; use a0
 
 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 1
+; CHECK: NumAgprs: 0
 define void @func_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..7851de641c5a3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@
 
 ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
 ; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; ASM: .amdhsa_next_free_sgpr 15
+; ASM: ; TotalNumSgprs: 21
+; ASM: ; NumSGPRsForWavesPerEU: 21
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
 ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
 ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
+; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000  ................
 
 ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; ASM: .amdhsa_next_free_sgpr 11
+; ASM: ; TotalNumSgprs: 17
+; ASM: ; NumSGPRsForWavesPerEU: 17
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
 
 ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 4
+; ASM: ; TotalNumSgprs: 10
+; ASM: ; NumSGPRsForWavesPerEU: 10
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
 ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
 ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000  @...............
 
 ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index f4d17e50cf18c..494ade73cb5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
-; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GFX8-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
+; GFX9-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x21{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      no_stack:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x20{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x20{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x21{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:

diff  --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index f52ba7000edeb..5ccf41c408b72 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
@@ -23,8 +23,16 @@
 ; ELF: Section: .text (0x2)
 ; ELF: }
 
-; GFX10: NumSGPRsForWavesPerEU: 6
-; GFX10: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 12
+; GFX10: NumVGPRsForWavesPerEU: 3
+
+; Wave32 and 64 behave 
diff erently due to the UserSGPRInit16Bug,
+; which only affects Wave32.
+; GFX11W32: NumSGPRsForWavesPerEU: 16
+; GFX11W32: NumVGPRsForWavesPerEU: 1
+
+; GFX11W64: NumSGPRsForWavesPerEU: 11
+; GFX11W64: NumVGPRsForWavesPerEU: 1
 
 define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 616867481d177..0e0a81d4657df 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
@@ -13,10 +13,10 @@ entry:
 attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
 
 ; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index e9fe4f3c618c7..5617a80fc94b4 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
 
 ; At least 1 wave per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
 
 ; At least 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
 
 ; At least 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
 
 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
 
 ; Between 5 and 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index 2e79d8bab46a6..efa416e301ccc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
 }
 ; ALL:      .set .Laliasee_default.num_vgpr, 0
 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
 
 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index 337da5d0ecbe0..62ca985bc6400 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index 075eddd2763d3..344f8200608f6 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
   call void asm sideeffect "; clobber v26 ", "~{v26}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 4fd181d3c0f43..3d36f8a514c47 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 00f72d5d8b1dd..2274c437daf62 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v252 ", "~{v252}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index dbd00f09943c0..db1269e8e95c2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
 ; Make sure to run a GPU with the SGPR allocation bug.
 
 ; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 34
+; GCN: ; TotalNumSgprs: 2
 ; GCN: ; NumVgprs: 0
 define void @use_vcc() #1 {
   call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 }
 
 ; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 36
-; VI: ; TotalNumSgprs: 38
+; CI: ; TotalNumSgprs: 4
+; VI: ; TotalNumSgprs: 6
 ; GCN: ; NumVgprs: 0
 define void @use_flat_scratch() #1 {
   call void asm sideeffect "", "~{flat_scratch}" () #0
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96

diff  --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 61830f18ad7a7..55dc394628176 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 8
+; CHECK: ; NumVgprs: 5
 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1

diff  --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3fe3cafd729a7..d8d7494d0dc1c 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 1
+; ALL-ASM:     .amdhsa_next_free_vgpr 3
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
     i32 %i,

diff  --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index 22d699a8f4809..59cf9825116fa 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
 ; OSABI-HSA-ELF:     .sgpr_count:     96
 ; OSABI-HSA-ELF:     .sgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .symbol:         elf_notes.kd
-; OSABI-HSA-ELF:     .vgpr_count:     0
+; OSABI-HSA-ELF:     .vgpr_count:     1
 ; OSABI-HSA-ELF:     .vgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .wavefront_size: 64
 ; OSABI-HSA-ELF: amdhsa.target:   amdgcn-amd-amdhsa--gfx802

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index a59382ba20dc5..ed1f3e1397abc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 8
-; VI-NOXNACK: ; TotalNumSgprs: 8
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
-; VI-XNACK: ; TotalNumSgprs: 12
-; HSA-VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 12
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 16
+; HSA-VI-XNACK: ; TotalNumSgprs: 22
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 10
-; VI-NOXNACK: ; TotalNumSgprs: 10
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
-; VI-XNACK: ; TotalNumSgprs: 12
-; HSA-VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
+; CI: ; TotalNumSgprs: 14
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
+; VI-XNACK: ; TotalNumSgprs: 16
+; HSA-VI-XNACK: ; TotalNumSgprs: 22
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
+; CI: ; TotalNumSgprs: 16
+; VI-NOXNACK: ; TotalNumSgprs: 18
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 18
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
+; CI: ; TotalNumSgprs: 16
+; VI-NOXNACK: ; TotalNumSgprs: 18
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 18
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()

diff  --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index e152f2ddd5253..0a6aa05c2d212 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
 ; GCN-LABEL: {{^}}use_vcc:
 ; GCN: .set use_vcc.num_vgpr, 0
 ; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 32
+; GCN: .set use_vcc.numbered_sgpr, 0
 ; GCN: .set use_vcc.private_seg_size, 0
 ; GCN: .set use_vcc.uses_vcc, 1
 ; GCN: .set use_vcc.uses_flat_scratch, 0
 ; GCN: .set use_vcc.has_dyn_sized_stack, 0
 ; GCN: .set use_vcc.has_recursion, 0
 ; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 ; GCN-LABEL: {{^}}use_flat_scratch:
 ; GCN: .set use_flat_scratch.num_vgpr, 0
 ; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 32
+; GCN: .set use_flat_scratch.numbered_sgpr, 0
 ; GCN: .set use_flat_scratch.private_seg_size, 0
 ; GCN: .set use_flat_scratch.uses_vcc, 0
 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1
 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
 ; GCN: .set use_flat_scratch.has_recursion, 0
 ; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 38
+; GCN: TotalNumSgprs: 6
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
 ; GCN-LABEL: {{^}}use_10_vgpr:
 ; GCN: .set use_10_vgpr.num_vgpr, 10
 ; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 32
+; GCN: .set use_10_vgpr.numbered_sgpr, 0
 ; GCN: .set use_10_vgpr.private_seg_size, 0
 ; GCN: .set use_10_vgpr.uses_vcc, 0
 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0
 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
 ; GCN: .set use_10_vgpr.has_recursion, 0
 ; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 10
 ; GCN: ScratchSize: 0
 define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
 ; GCN-LABEL: {{^}}use_50_vgpr:
 ; GCN:	.set use_50_vgpr.num_vgpr, 50
 ; GCN:	.set use_50_vgpr.num_agpr, 0
-; GCN:	.set use_50_vgpr.numbered_sgpr, 32
+; GCN:	.set use_50_vgpr.numbered_sgpr, 0
 ; GCN:	.set use_50_vgpr.private_seg_size, 0
 ; GCN:	.set use_50_vgpr.uses_vcc, 0
 ; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
 ; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_50_vgpr.has_recursion, 0
 ; GCN:	.set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 50
 ; GCN: ScratchSize: 0
 define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
 ; GCN-LABEL: {{^}}use_stack0:
 ; GCN:	.set use_stack0.num_vgpr, 1
 ; GCN:	.set use_stack0.num_agpr, 0
-; GCN:	.set use_stack0.numbered_sgpr, 33
+; GCN:	.set use_stack0.numbered_sgpr, 0
 ; GCN:	.set use_stack0.private_seg_size, 2052
 ; GCN:	.set use_stack0.uses_vcc, 0
 ; GCN:	.set use_stack0.uses_flat_scratch, 0
 ; GCN:	.set use_stack0.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack0.has_recursion, 0
 ; GCN:	.set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 37
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 2052
 define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
 ; GCN-LABEL: {{^}}use_stack1:
 ; GCN:	.set use_stack1.num_vgpr, 1
 ; GCN:	.set use_stack1.num_agpr, 0
-; GCN:	.set use_stack1.numbered_sgpr, 33
+; GCN:	.set use_stack1.numbered_sgpr, 0
 ; GCN:	.set use_stack1.private_seg_size, 404
 ; GCN:	.set use_stack1.uses_vcc, 0
 ; GCN:	.set use_stack1.uses_flat_scratch, 0
 ; GCN:	.set use_stack1.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack1.has_recursion, 0
 ; GCN:	.set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 37
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 404
 define void @use_stack1() #1 {

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index cd89a36fe538b..bf452a9e38e01 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     10
+; CHECK:     .sgpr_count:     16
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64

diff  --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 024593c49dba1..f7e3498907005 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF:   0250: 6770725F 636F756E 7402B12E 76677072
+; ELF:   0250: 6770725F 636F756E 7401B12E 76677072
 ; ELF:   0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
 ; ELF:   0270: 61766566 726F6E74 5F73697A 6540AD61
 ; ELF:   0280: 6D646873 612E7461 72676574 BD616D64

diff  --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
new file mode 100644
index 0000000000000..45de8a79fe88d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -0,0 +1,72 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Use VGPRs above the input arguments.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; clobber v28", "~{v28}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

diff  --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
new file mode 100644
index 0000000000000..9c636d4516a80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -0,0 +1,46 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: leaf_shader:
+; CHECK: .vgpr_count:{{.*}}0x1{{$}}
+
+; Function without calls.
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
+                              i32 %active.vgpr1, i32 %active.vgpr2,
+                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
+                              local_unnamed_addr {
+entry:
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %compute, label %merge
+
+compute:
+  ; Perform a more complex computation using active VGPRs
+  %square = mul i32 %active.vgpr1, %active.vgpr1
+  %product = mul i32 %square, %active.vgpr2
+  %sum = add i32 %product, %input.value
+  %result = add i32 %sum, 42
+  br label %merge
+
+merge:
+  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
+
+  store i32 %final.result, ptr %output.ptr, align 4
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

diff  --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
new file mode 100644
index 0000000000000..1b0d33cec7052
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
+; In that case, the VGPR should be included in the .vgpr_count
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xd{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

diff  --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
new file mode 100644
index 0000000000000..9408501718784
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xa{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)

diff  --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 464cd820028cc..c3b033113431f 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
 ; GCN-NEXT: s_addc_u32 s17,
 ; GCN-NEXT: s_setpc_b64 s[16:17]
 
-; GCN: ; TotalNumSgprs: 32
+; GCN: ; TotalNumSgprs: 18
 ; GCN: ; NumVgprs: 8
 define void @func_tail_call() #1 {
   tail call void @func()

diff  --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 60bbf4646ee03..03694b913d6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
 ; CHECK-LABEL: P_RemoveMobj:
 ; CHECK: .set P_RemoveMobj.num_vgpr, 0
 ; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
 ; CHECK: .set P_RemoveMobj.private_seg_size, 0
 ; CHECK: .set P_RemoveMobj.uses_vcc, 0
 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
 ; CHECK-LABEL: P_SpawnMobj:
 ; CHECK: .set P_SpawnMobj.num_vgpr, 0
 ; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
 ; CHECK: .set P_SpawnMobj.private_seg_size, 0
 ; CHECK: .set P_SpawnMobj.uses_vcc, 0
 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
 ; CHECK-LABEL: G_PlayerReborn:
 ; CHECK: .set G_PlayerReborn.num_vgpr, 0
 ; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
 ; CHECK: .set G_PlayerReborn.private_seg_size, 0
 ; CHECK: .set G_PlayerReborn.uses_vcc, 0
 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
 ; CHECK-LABEL: P_SetThingPosition:
 ; CHECK: .set P_SetThingPosition.num_vgpr, 0
 ; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
 ; CHECK: .set P_SetThingPosition.private_seg_size, 0
 ; CHECK: .set P_SetThingPosition.uses_vcc, 0
 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
 ; CHECK-LABEL: HU_Start:
 ; CHECK: .set HU_Start.num_vgpr, 0
 ; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 32
+; CHECK: .set HU_Start.numbered_sgpr, 0
 ; CHECK: .set HU_Start.private_seg_size, 0
 ; CHECK: .set HU_Start.uses_vcc, 0
 ; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
 ; CHECK-LABEL: AM_Stop:
 ; CHECK: .set AM_Stop.num_vgpr, 0
 ; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 32
+; CHECK: .set AM_Stop.numbered_sgpr, 0
 ; CHECK: .set AM_Stop.private_seg_size, 0
 ; CHECK: .set AM_Stop.uses_vcc, 0
 ; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
 ; CHECK-LABEL: D_AdvanceDemo:
 ; CHECK: .set D_AdvanceDemo.num_vgpr, 0
 ; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0
 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0
 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
 ; CHECK-LABEL: F_StartFinale:
 ; CHECK: .set F_StartFinale.num_vgpr, 0
 ; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 32
+; CHECK: .set F_StartFinale.numbered_sgpr, 0
 ; CHECK: .set F_StartFinale.private_seg_size, 0
 ; CHECK: .set F_StartFinale.uses_vcc, 0
 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
 ; CHECK-LABEL: F_Ticker:
 ; CHECK: .set F_Ticker.num_vgpr, 0
 ; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 32
+; CHECK: .set F_Ticker.numbered_sgpr, 0
 ; CHECK: .set F_Ticker.private_seg_size, 0
 ; CHECK: .set F_Ticker.uses_vcc, 0
 ; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
 ; CHECK-LABEL: P_TempSaveGameFile:
 ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
 ; CHECK-LABEL: P_SaveGameFile:
 ; CHECK: .set P_SaveGameFile.num_vgpr, 2
 ; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
 ; CHECK: .set P_SaveGameFile.private_seg_size, 0
 ; CHECK: .set P_SaveGameFile.uses_vcc, 0
 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+max(I_Error.private_seg_size)
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)

diff  --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index 7a810d0067c17..83f58db1aa67f 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, 13
 ; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 32
+; CHECK: .set qux.numbered_sgpr, 0
 ; CHECK: .set qux.private_seg_size, 0
 ; CHECK: .set qux.uses_vcc, 0
 ; CHECK: .set qux.uses_flat_scratch, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 638dc8965987e..28c3131302a31 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
@@ -122,7 +122,7 @@
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
-; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
@@ -134,7 +134,7 @@
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:

diff  --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index 5b9b0feea9900..a71fd7fe782ff 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
 
 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 4
+; ;CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   %i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 4
+; CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; Additionally set the PSInputAddr to 0 via the metadata
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 4
+; CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
 .entry:
   ret { <4 x float> } undef

diff  --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 35e11ad6a648b..bfcf90037bfd3 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,7 +24,9 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
   store i32 %x, ptr addrspace(1) %out, align 4
   ret void
 }
+
+attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

diff  --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index afb77ed190896..a2470a60cb19f 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 4
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
+; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 22
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 3
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 0d25bc97ff775..557ffd27a07f6 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: {{^}}spill:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 1
-; GCN-GCNTRACKERS:    NumVgprs: 2
+; GCN:    NumVgprs: 3
+; GCN-GCNTRACKERS:    NumVgprs: 3
 ; GCN:    ScratchSize: 0
 ; GCN-GCNTRACKERS:    ScratchSize: 0
 ; GCN:    Occupancy: 5

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..95d707aee5662 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
 ; allow scheduling of other instructions which reduce RP
 
 ; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL:    NumSgprs: 33
-; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
+; GFX11-PAL:    NumSgprs: 0
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 0
 ; GFX11-PAL:    NumVgprs: 64
 ; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
 ; GFX11-PAL:    ScratchSize: 220

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 462ac23ec7e0e..8300a52955b91 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
 
-; SI-MINREG: NumSgprs: {{[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
 
 ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
 ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
 
 ; stores may alias loads
-; VI-MINREG: NumSgprs: {{[0-9]$}}
+; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
 ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
 
 ; stores may alias loads

diff  --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 6ddf0986755f9..682bbdedb37a3 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 30accc846d2b6..d3def45c4f9d2 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     9
+; ELF: .sgpr_count:     12
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 4f84b31f1877b..ad831e040d722 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 0
 
 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     5
+; ELF: .sgpr_count:     8
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 644f434923368..d1e28e11601ce 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     9
+; ELF: .sgpr_count:     12
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index cf5b95a729974..4802ec861d685 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: __unnamed_1:
 ; CHECK: .set __unnamed_1.num_vgpr, 0
 ; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 32
+; CHECK: .set __unnamed_1.numbered_sgpr, 0
 ; CHECK: .set __unnamed_1.private_seg_size, 0
 ; CHECK: .set __unnamed_1.uses_vcc, 0
 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0
@@ -16,7 +16,7 @@ entry:
 }
 
 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+max(__unnamed_1.private_seg_size)

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 2cb5e309c8c21..ee35dc4cddade 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }
 
 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 32
+; GFX90A: NumVgprs: 1
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 33
+; GFX90A: TotalNumVgprs: 5
 define void @f1024_0() #1024 {
   call void @foo()
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
new file mode 100644
index 0000000000000..8c8182db7b479
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
+; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
+target triple = "amdgcn-amd-amdhsa"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
+; Only hardware-initialized VGPRs (v0) are read in this kernel.
+
+; CHECK-LABEL: amdhsa.kernels:
+; CHECK-LABEL: kernel_x
+; CHECK: .vgpr_count:     1
+define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; CHECK-LABEL: kernel_z
+; PACKED: .vgpr_count:     1
+; NOTPACKED: .vgpr_count:     3
+define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.z()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
new file mode 100644
index 0000000000000..f5d28a0ae1628
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
+target triple = "amdgcn--amdpal"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
+; CHECK-LABEL: .entry_point_symbol: kernel_use
+; CHECK: .vgpr_count:     0x20
+define amdgpu_cs void @kernel_use([32 x i32] %args) {
+entry:
+  %a = extractvalue [32 x i32] %args, 14
+  store i32 %a, ptr addrspace(1) @global
+  ret void
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: chain_func:
+; CHECK: .vgpr_count:     0x1
+define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
+entry:
+  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
+        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
+  unreachable
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: gfx_func:
+; CHECK: .vgpr_count:     0x1
+define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
+entry:
+  ret [32 x i32] %args
+}