[llvm] [AMDGPU] Skip register uses in AMDGPUResourceUsageAnalysis (PR #133242)

Tue May 6 04:57:31 PDT 2025

https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/133242

>From 8a56150c655ebee059a2ea2a1eee2cf09ca7a6df Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Mon, 17 Mar 2025 10:03:27 +0100
Subject: [PATCH 1/9] [AMDGPU] Ignore inactive VGPRs in .vgpr_count

When using the amdgcn.init.whole.wave intrinsic, we add dummy VGPR
arguments with the purpose of preserving their inactive lanes. The
pattern may look something like this:

```
entry:
  call amdgcn.init.whole.wave
  brand to shader or tail

shader:
  $vInactive = IMPLICIT_DEF ; Tells regalloc it's safe to use the active lanes
  actual code...

tail:
  call amdgcn.cs.chain [...], implicit $vInactive
```

We should not report these VGPRs in the .vgpr_count metadata. This patch
achieves that goal by ignoring IMPLICIT_DEFs and SI_TCRETURNs in
functions that use the amdgcn.init.whole.wave intrinsic. All other VGPRs
are counted as usual.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |  5 +-
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 16 ++++
 .../init-whole-wave-vgpr-count-large.ll       | 76 ++++++++++++++++++
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll | 50 ++++++++++++
 ...init-whole-wave-vgpr-count-use-inactive.ll | 78 +++++++++++++++++++
 .../AMDGPU/init-whole-wave-vgpr-count.ll      | 75 ++++++++++++++++++
 6 files changed, 299 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 800e2b9c0e657..7769bc5d74ebd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -990,7 +990,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // dispatch registers are function args.
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
-  if (isShader(F.getCallingConv())) {
+  // Shaders that use the init.whole.wave intrinsic sometimes have VGPR
+  // arguments that are only added for the purpose of preserving their inactive
+  // lanes. Skip including them in the VGPR count.
+  if (isShader(F.getCallingConv()) && !MFI->hasInitWholeWave()) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 9a609a1752de0..05d1aa38d4a25 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -156,8 +156,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
+  bool IsIWWFunction = MFI->hasInitWholeWave();
+
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
+      // At this point, the chain call pseudos are already expanded.
+      bool IsChainCall = MI.getOpcode() == AMDGPU::SI_TCRETURN;
+      bool IsImplicitDef = MI.isImplicitDef();
+
       // TODO: Check regmasks? Do they occur anywhere except calls?
       for (const MachineOperand &MO : MI.operands()) {
         unsigned Width = 0;
@@ -239,6 +245,16 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
           break;
         }
 
+        // For functions that use the llvm.amdgcn.init.whole.wave intrinsic, we
+        // often add artificial VGPR arguments for the purpose of preserving
+        // their inactive lanes. These should not be reported as part of our
+        // VGPR usage. We can identify them easily because they're only used in
+        // the chain call, and possibly in an IMPLICIT_DEF coming from an
+        // llvm.amdgcn.dead intrinsic.
+        if (IsIWWFunction && (IsChainCall || IsImplicitDef) &&
+            TRI.isVectorRegister(MRI, Reg))
+          continue;
+
         if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
             AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
new file mode 100644
index 0000000000000..e47f5e25ead3a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -0,0 +1,76 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Use VGPRs above the input arguments.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0x1d{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; clobber v28", "~{v28}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
new file mode 100644
index 0000000000000..5d7472fd3c56e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -0,0 +1,50 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: leaf_shader:
+; CHECK: .vgpr_count:{{.*}}0xc{{$}}
+
+; Function without calls.
+define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
+                              i32 %active.vgpr1, i32 %active.vgpr2,
+                              i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                              i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6)
+                              local_unnamed_addr {
+entry:
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %compute, label %merge
+
+compute:
+  ; Perform a more complex computation using active VGPRs
+  %square = mul i32 %active.vgpr1, %active.vgpr1
+  %product = mul i32 %square, %active.vgpr2
+  %sum = add i32 %product, %input.value
+  %result = add i32 %sum, 42
+  br label %merge
+
+merge:
+  %final.result = phi i32 [ 0, %entry ], [ %result, %compute ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %compute ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %compute ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %compute ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %compute ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %compute ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %compute ]
+
+  store i32 %final.result, ptr %output.ptr, align 4
+
+  ret void
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
new file mode 100644
index 0000000000000..0c699a07cb3fd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -0,0 +1,78 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; The shader is free to use any of the VGPRs mapped to a %inactive.vpgr as long as it only touches its active lanes.
+; In that case, the VGPR should be included in the .vgpr_count
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xd{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  call void asm sideeffect "; use VGPR for %inactive.vgpr2", "~{v12}"()
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
new file mode 100644
index 0000000000000..b9130dd1b7ed4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -0,0 +1,75 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s
+
+; CHECK-LABEL: .shader_functions:
+
+; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
+; CHECK-LABEL: _miss_1:
+; CHECK: .vgpr_count:{{.*}}0xa{{$}}
+
+define amdgpu_cs_chain void @_miss_1(ptr inreg %next.callee, i32 inreg %global.table, i32 inreg %max.outgoing.vgpr.count,
+                                    i32 %vcr, { i32 } %system.data,
+                                    i32 %inactive.vgpr, i32 %inactive.vgpr1, i32 %inactive.vgpr2, i32 %inactive.vgpr3,
+                                    i32 %inactive.vgpr4, i32 %inactive.vgpr5, i32 %inactive.vgpr6, i32 %inactive.vgpr7,
+                                    i32 %inactive.vgpr8, i32 %inactive.vgpr9)
+                                    local_unnamed_addr {
+entry:
+  %system.data.value = extractvalue { i32 } %system.data, 0
+  %dead.val = call i32 @llvm.amdgcn.dead.i32()
+  %is.whole.wave = call i1 @llvm.amdgcn.init.whole.wave()
+  br i1 %is.whole.wave, label %shader, label %tail
+
+shader:
+  %system.data.extract = extractvalue { i32 } %system.data, 0
+  %data.mul = mul i32 %system.data.extract, 2
+  %data.add = add i32 %data.mul, 1
+  br label %tail
+
+tail:
+  %final.vcr = phi i32 [ %vcr, %entry ], [ %data.mul, %shader ]
+  %final.sys.data = phi i32 [ %system.data.value, %entry ], [ %data.add, %shader ]
+  %final.inactive0 = phi i32 [ %inactive.vgpr, %entry ], [ %dead.val, %shader ]
+  %final.inactive1 = phi i32 [ %inactive.vgpr1, %entry ], [ %dead.val, %shader ]
+  %final.inactive2 = phi i32 [ %inactive.vgpr2, %entry ], [ %dead.val, %shader ]
+  %final.inactive3 = phi i32 [ %inactive.vgpr3, %entry ], [ %dead.val, %shader ]
+  %final.inactive4 = phi i32 [ %inactive.vgpr4, %entry ], [ %dead.val, %shader ]
+  %final.inactive5 = phi i32 [ %inactive.vgpr5, %entry ], [ %dead.val, %shader ]
+  %final.inactive6 = phi i32 [ %inactive.vgpr6, %entry ], [ %dead.val, %shader ]
+  %final.inactive7 = phi i32 [ %inactive.vgpr7, %entry ], [ %dead.val, %shader ]
+  %final.inactive8 = phi i32 [ %inactive.vgpr8, %entry ], [ %dead.val, %shader ]
+  %final.inactive9 = phi i32 [ %inactive.vgpr9, %entry ], [ %dead.val, %shader ]
+
+  %struct.init = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } poison, i32 %final.vcr, 0
+  %struct.with.data = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.init, i32 %final.sys.data, 1
+  %struct.with.inactive0 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.data, i32 %final.inactive0, 2
+  %struct.with.inactive1 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive0, i32 %final.inactive1, 3
+  %struct.with.inactive2 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive1, i32 %final.inactive2, 4
+  %struct.with.inactive3 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive2, i32 %final.inactive3, 5
+  %struct.with.inactive4 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive3, i32 %final.inactive4, 6
+  %struct.with.inactive5 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive4, i32 %final.inactive5, 7
+  %struct.with.inactive6 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive5, i32 %final.inactive6, 8
+  %struct.with.inactive7 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive6, i32 %final.inactive7, 9
+  %struct.with.inactive8 = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive7, i32 %final.inactive8, 10
+  %final.struct = insertvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %struct.with.inactive8, i32 %final.inactive9, 11
+
+  %vec.global = insertelement <4 x i32> poison, i32 %global.table, i64 0
+  %vec.max.vgpr = insertelement <4 x i32> %vec.global, i32 %max.outgoing.vgpr.count, i64 1
+  %vec.sys.data = insertelement <4 x i32> %vec.max.vgpr, i32 %final.sys.data, i64 2
+  %final.vec = insertelement <4 x i32> %vec.sys.data, i32 0, i64 3
+
+  call void (ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32, ...)
+        @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(
+        ptr %next.callee, i32 0, <4 x i32> inreg %final.vec,
+        { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %final.struct,
+        i32 1, i32 %max.outgoing.vgpr.count, i32 -1, ptr @retry_vgpr_alloc.v4i32)
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.dead.i32()
+declare i1 @llvm.amdgcn.init.whole.wave()
+declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
+
+declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}

>From 5089dad5593404e431ddcc6e054f5b30d8bf6186 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 27 Mar 2025 15:17:16 +0100
Subject: [PATCH 2/9] Skip implicit defs unconditionally

---
 llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 05d1aa38d4a25..dc23a2f0eeba8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -160,9 +160,11 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
+      if (MI.isImplicitDef())
+        continue;
+
       // At this point, the chain call pseudos are already expanded.
       bool IsChainCall = MI.getOpcode() == AMDGPU::SI_TCRETURN;
-      bool IsImplicitDef = MI.isImplicitDef();
 
       // TODO: Check regmasks? Do they occur anywhere except calls?
       for (const MachineOperand &MO : MI.operands()) {
@@ -251,8 +253,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
         // VGPR usage. We can identify them easily because they're only used in
         // the chain call, and possibly in an IMPLICIT_DEF coming from an
         // llvm.amdgcn.dead intrinsic.
-        if (IsIWWFunction && (IsChainCall || IsImplicitDef) &&
-            TRI.isVectorRegister(MRI, Reg))
+        if (IsIWWFunction && IsChainCall && TRI.isVectorRegister(MRI, Reg))
           continue;
 
         if (AMDGPU::SGPR_32RegClass.contains(Reg) ||

>From 726506f8da71b2f4a81b9b0eef28a34bd1078cbe Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Wed, 2 Apr 2025 13:46:41 +0200
Subject: [PATCH 3/9] Skip register uses in AMDGPUResourceUsageAnalysis

Don't count register uses when determining the maximum number of
registers used by a function. Count only the defs. This is really an
underestimate of the true register usage, but in practice that's not
a problem because if a function uses a register, then it has either
defined it earlier, or some other function that executed before has
defined it. In particular, the register counts are used:
1. When launching an entry function - in which case we're safe because
   the register counts of the entry function will include the register
   counts of all callees.
2. At function boundaries in dynamic VGPR mode. In this case it's safe
   because whenever we set the new VGPR allocation we take into account
   the outgoing_vgpr_count set by the middle-end.

The main advantage of doing this is that the artificial VGPR arguments
used only for preserving the inactive lanes when using the
llvm.amdgcn.init.whole.wave intrinsic are no longer counted. This
enables us to allocate only the registers we need in dynamic VGPR mode.
---
 llvm/docs/AMDGPUUsage.rst                        | 11 +++++------
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp       | 16 +---------------
 ...xpr-knownbits-assign-crash-gh-issue-110930.ll |  2 +-
 .../AMDGPU/unnamed-function-resource-info.ll     |  2 +-
 .../CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll     |  4 ++--
 5 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ab507e3714ebb..5e0d7e0af80b7 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -4253,10 +4253,9 @@ same *vendor-name*.
                                                                   wavefront for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  used explicitly, or
+                                                                  written to, or
                                                                   if a higher numbered
-                                                                  register is used
-                                                                  explicitly. This
+                                                                  register is written to. This
                                                                   includes the special
                                                                   SGPRs for VCC, Flat
                                                                   Scratch (GFX7-GFX9)
@@ -4274,10 +4273,10 @@ same *vendor-name*.
                                                                   each work-item for
                                                                   GFX6-GFX9. A register
                                                                   is required if it is
-                                                                  used explicitly, or
+                                                                  written to, or
                                                                   if a higher numbered
-                                                                  register is used
-                                                                  explicitly.
+                                                                  register is
+                                                                  written to.
      ".agpr_count"                       integer        Required  Number of accumulator
                                                                   registers required by
                                                                   each work-item for
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index dc23a2f0eeba8..bf65594629684 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -156,23 +156,18 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
-  bool IsIWWFunction = MFI->hasInitWholeWave();
-
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
       if (MI.isImplicitDef())
         continue;
 
-      // At this point, the chain call pseudos are already expanded.
-      bool IsChainCall = MI.getOpcode() == AMDGPU::SI_TCRETURN;
-
       // TODO: Check regmasks? Do they occur anywhere except calls?
       for (const MachineOperand &MO : MI.operands()) {
         unsigned Width = 0;
         bool IsSGPR = false;
         bool IsAGPR = false;
 
-        if (!MO.isReg())
+        if (!MO.isReg() || MO.isUse())
           continue;
 
         Register Reg = MO.getReg();
@@ -247,15 +242,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
           break;
         }
 
-        // For functions that use the llvm.amdgcn.init.whole.wave intrinsic, we
-        // often add artificial VGPR arguments for the purpose of preserving
-        // their inactive lanes. These should not be reported as part of our
-        // VGPR usage. We can identify them easily because they're only used in
-        // the chain call, and possibly in an IMPLICIT_DEF coming from an
-        // llvm.amdgcn.dead intrinsic.
-        if (IsIWWFunction && IsChainCall && TRI.isVectorRegister(MRI, Reg))
-          continue;
-
         if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
             AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 52f380b7f80a3..e5d483c8488db 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -264,7 +264,7 @@ define ptr @P_SaveGameFile() {
 ; CHECK-LABEL: R_FlatNumForName:
 ; CHECK: .set R_FlatNumForName.num_vgpr, max(42, I_Error.num_vgpr)
 ; CHECK: .set R_FlatNumForName.num_agpr, max(0, I_Error.num_agpr)
-; CHECK: .set R_FlatNumForName.numbered_sgpr, max(56, I_Error.numbered_sgpr)
+; CHECK: .set R_FlatNumForName.numbered_sgpr, max(34, I_Error.numbered_sgpr)
 ; CHECK: .set R_FlatNumForName.private_seg_size, 16+(max(I_Error.private_seg_size))
 ; CHECK: .set R_FlatNumForName.uses_vcc, or(1, I_Error.uses_vcc)
 ; CHECK: .set R_FlatNumForName.uses_flat_scratch, or(0, I_Error.uses_flat_scratch)
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index c9fbd369e062d..a17ab5527f494 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -16,7 +16,7 @@ entry:
 }
 
 ; CHECK-LABEL: __unnamed_2:
-; CHECK: .set __unnamed_2.num_vgpr, max(32, __unnamed_1.num_vgpr)
+; CHECK: .set __unnamed_2.num_vgpr, max(1, __unnamed_1.num_vgpr)
 ; CHECK: .set __unnamed_2.num_agpr, max(0, __unnamed_1.num_agpr)
 ; CHECK: .set __unnamed_2.numbered_sgpr, max(34, __unnamed_1.numbered_sgpr)
 ; CHECK: .set __unnamed_2.private_seg_size, 16+(max(__unnamed_1.private_seg_size))
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 2cb5e309c8c21..ee35dc4cddade 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -1264,9 +1264,9 @@ define amdgpu_kernel void @k1024_call_no_agprs_ub_callee() #1025 {
 }
 
 ; GCN-LABEL: {{^}}f1024_0:
-; GFX90A: NumVgprs: 32
+; GFX90A: NumVgprs: 1
 ; GFX90A: NumAgprs: 1
-; GFX90A: TotalNumVgprs: 33
+; GFX90A: TotalNumVgprs: 5
 define void @f1024_0() #1024 {
   call void @foo()
   ret void

>From f42baae963d4cafe650121902f3ce21743f93c0b Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Thu, 3 Apr 2025 10:23:35 +0200
Subject: [PATCH 4/9] s/no-init-whole-wave/isEntryFunction

---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 7769bc5d74ebd..8374374da166c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -993,7 +993,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // Shaders that use the init.whole.wave intrinsic sometimes have VGPR
   // arguments that are only added for the purpose of preserving their inactive
   // lanes. Skip including them in the VGPR count.
-  if (isShader(F.getCallingConv()) && !MFI->hasInitWholeWave()) {
+  if (isShader(F.getCallingConv()) && isEntryFunctionCC(F.getCallingConv())) {
     bool IsPixelShader =
         F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
 

>From affc8379c1ed3e8959acebb47dd23429c441c5ae Mon Sep 17 00:00:00 2001
From: Diana Picus <diana.picus at gmail.com>
Date: Mon, 7 Apr 2025 10:29:40 +0200
Subject: [PATCH 5/9] Fix typo in comment. NFC

Co-authored-by: Thomas Symalla <5754458+tsymalla at users.noreply.github.com>
---
 .../CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
index 0c699a07cb3fd..f1f7fb22d44c6 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: .shader_functions:
 
 ; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
-; The shader is free to use any of the VGPRs mapped to a %inactive.vpgr as long as it only touches its active lanes.
+; The shader is free to use any of the VGPRs mapped to a %inactive.vgpr as long as it only touches its active lanes.
 ; In that case, the VGPR should be included in the .vgpr_count
 ; CHECK-LABEL: _miss_1:
 ; CHECK: .vgpr_count:{{.*}}0xd{{$}}

>From 733829b0e65ed3c02fb561ab63ec2bf125163f0f Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Tue, 29 Apr 2025 14:54:48 +0200
Subject: [PATCH 6/9] Unify code paths; include user + sys SGPRS if used

---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   |   9 -
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 286 ++----------------
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  15 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   5 +
 .../AMDGPU/amdgpu-no-agprs-violations.ll      |   4 +-
 llvm/test/CodeGen/AMDGPU/amdpal-callable.ll   |  12 +-
 .../AMDGPU/call-alias-register-usage-agpr.ll  |   2 +-
 .../AMDGPU/call-alias-register-usage0.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage1.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage2.ll      |   2 +-
 .../AMDGPU/call-alias-register-usage3.ll      |   2 +-
 .../AMDGPU/call-graph-register-usage.ll       |   6 +-
 llvm/test/CodeGen/AMDGPU/coalescer_remat.ll   |   2 +-
 .../CodeGen/AMDGPU/function-resource-usage.ll |  24 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |   2 +-
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 ...-knownbits-assign-crash-gh-issue-110930.ll |  22 +-
 .../multi-call-resource-usage-mcexpr.ll       |   2 +-
 .../AMDGPU/pal-metadata-3.0-callable.ll       |   8 +-
 .../CodeGen/AMDGPU/ps-shader-arg-count.ll     |   6 +-
 .../CodeGen/AMDGPU/register-count-comments.ll |   4 +-
 .../AMDGPU/schedule-amdgpu-trackers.ll        |   4 +-
 .../AMDGPU/unnamed-function-resource-info.ll  |   2 +-
 24 files changed, 94 insertions(+), 333 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 8374374da166c..7e3700a3336f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1064,15 +1064,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
-  } else if (isKernel(F.getCallingConv()) &&
-             MFI->getNumKernargPreloadedSGPRs()) {
-    // Consider cases where the total number of UserSGPRs with trailing
-    // allocated preload SGPRs, is greater than the number of explicitly
-    // referenced SGPRs.
-    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
-        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
-    ProgInfo.NumSGPR =
-        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index bf65594629684..942c3ccd13f0c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -137,277 +137,29 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (MFI->isStackRealigned())
     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
 
-  Info.UsesVCC =
-      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
-
-  // If there are no calls, MachineRegisterInfo can tell us the used register
-  // count easily.
-  // A tail call isn't considered a call for MachineFrameInfo's purposes.
-  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
-    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
-    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
-    if (ST.hasMAIInsts())
-      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC);
+
+  Info.NumVGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+  Info.NumExplicitSGPR =
+      TRI.getNumDefinedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
+
+  // Count any user or system SGPRs that are actually used.
+  for (int I = MFI->getNumPreloadedSGPRs() - 1; I >= 0; I--)
+    if (MRI.isPhysRegUsed(AMDGPU::SGPR0 + I)) {
+      Info.NumExplicitSGPR = std::max<int32_t>(
+          Info.NumExplicitSGPR, TRI.getHWRegIndex(AMDGPU::SGPR0 + I) + 1);
+      break;
+    }
+
+  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
     return Info;
-  }
 
-  int32_t MaxVGPR = -1;
-  int32_t MaxAGPR = -1;
-  int32_t MaxSGPR = -1;
   Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
-      if (MI.isImplicitDef())
-        continue;
-
-      // TODO: Check regmasks? Do they occur anywhere except calls?
-      for (const MachineOperand &MO : MI.operands()) {
-        unsigned Width = 0;
-        bool IsSGPR = false;
-        bool IsAGPR = false;
-
-        if (!MO.isReg() || MO.isUse())
-          continue;
-
-        Register Reg = MO.getReg();
-        switch (Reg) {
-        case AMDGPU::EXEC:
-        case AMDGPU::EXEC_LO:
-        case AMDGPU::EXEC_HI:
-        case AMDGPU::SCC:
-        case AMDGPU::M0:
-        case AMDGPU::M0_LO16:
-        case AMDGPU::M0_HI16:
-        case AMDGPU::SRC_SHARED_BASE_LO:
-        case AMDGPU::SRC_SHARED_BASE:
-        case AMDGPU::SRC_SHARED_LIMIT_LO:
-        case AMDGPU::SRC_SHARED_LIMIT:
-        case AMDGPU::SRC_PRIVATE_BASE_LO:
-        case AMDGPU::SRC_PRIVATE_BASE:
-        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
-        case AMDGPU::SRC_PRIVATE_LIMIT:
-        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
-        case AMDGPU::SGPR_NULL:
-        case AMDGPU::SGPR_NULL64:
-        case AMDGPU::MODE:
-          continue;
-
-        case AMDGPU::NoRegister:
-          assert(MI.isDebugInstr() &&
-                 "Instruction uses invalid noreg register");
-          continue;
-
-        case AMDGPU::VCC:
-        case AMDGPU::VCC_LO:
-        case AMDGPU::VCC_HI:
-        case AMDGPU::VCC_LO_LO16:
-        case AMDGPU::VCC_LO_HI16:
-        case AMDGPU::VCC_HI_LO16:
-        case AMDGPU::VCC_HI_HI16:
-          Info.UsesVCC = true;
-          continue;
-
-        case AMDGPU::FLAT_SCR:
-        case AMDGPU::FLAT_SCR_LO:
-        case AMDGPU::FLAT_SCR_HI:
-          continue;
-
-        case AMDGPU::XNACK_MASK:
-        case AMDGPU::XNACK_MASK_LO:
-        case AMDGPU::XNACK_MASK_HI:
-          llvm_unreachable("xnack_mask registers should not be used");
-
-        case AMDGPU::LDS_DIRECT:
-          llvm_unreachable("lds_direct register should not be used");
-
-        case AMDGPU::TBA:
-        case AMDGPU::TBA_LO:
-        case AMDGPU::TBA_HI:
-        case AMDGPU::TMA:
-        case AMDGPU::TMA_LO:
-        case AMDGPU::TMA_HI:
-          llvm_unreachable("trap handler registers should not be used");
-
-        case AMDGPU::SRC_VCCZ:
-          llvm_unreachable("src_vccz register should not be used");
-
-        case AMDGPU::SRC_EXECZ:
-          llvm_unreachable("src_execz register should not be used");
-
-        case AMDGPU::SRC_SCC:
-          llvm_unreachable("src_scc register should not be used");
-
-        default:
-          break;
-        }
-
-        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
-            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
-            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 1;
-        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 1;
-        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
-                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 1;
-        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 2;
-        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 2;
-        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 3;
-        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 3;
-        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 3;
-        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 4;
-        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 4;
-        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 5;
-        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 5;
-        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 5;
-        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 6;
-        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 6;
-        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 6;
-        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 7;
-        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 7;
-        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 7;
-        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 8;
-        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 8;
-        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 9;
-        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 9;
-        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 9;
-        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 10;
-        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 10;
-        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 10;
-        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 11;
-        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 11;
-        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 11;
-        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 12;
-        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 12;
-        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 12;
-        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 16;
-        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 16;
-        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 16;
-        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
-          IsSGPR = true;
-          Width = 32;
-        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          Width = 32;
-        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
-          IsSGPR = false;
-          IsAGPR = true;
-          Width = 32;
-        } else {
-          // We only expect TTMP registers or registers that do not belong to
-          // any RC.
-          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
-                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
-                  !TRI.getPhysRegBaseClass(Reg)) &&
-                 "Unknown register class");
-        }
-        unsigned HWReg = TRI.getHWRegIndex(Reg);
-        int MaxUsed = HWReg + Width - 1;
-        if (IsSGPR) {
-          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
-        } else if (IsAGPR) {
-          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
-        } else {
-          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
-        }
-      }
-
       if (MI.isCall()) {
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
@@ -467,9 +219,5 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     }
   }
 
-  Info.NumExplicitSGPR = MaxSGPR + 1;
-  Info.NumVGPR = MaxVGPR + 1;
-  Info.NumAGPR = MaxAGPR + 1;
-
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c1ac9491b2363..94cdb48b31af2 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3996,6 +3996,21 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
   return 0;
 }
 
+unsigned
+SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
+                                      const TargetRegisterClass &RC) const {
+  auto isDefinedByImplicitDef = [](MachineOperand &Op) {
+    return Op.getParent()->isImplicitDef();
+  };
+
+  for (MCPhysReg Reg : reverse(RC.getRegisters()))
+    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
+      if (!(MRI.def_empty(*AI) || std::all_of(MRI.def_begin(*AI), MRI.def_end(),
+                                              isDefinedByImplicitDef)))
+        return getHWRegIndex(Reg) + 1;
+  return 0;
+}
+
 SmallVector<StringLiteral>
 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
                                   const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index f3068963fd10f..b3a9d2ba9d690 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -470,6 +470,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
                               const TargetRegisterClass &RC) const;
 
+  // \returns the number of registers of a given \p RC defined in a function.
+  // Does not go inside function calls.
+  unsigned getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
+                                 const TargetRegisterClass &RC) const;
+
   std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override {
     return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG
                              : std::optional<uint8_t>{};
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index 7bf9a29e9ff44..44308e353680b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -14,7 +14,7 @@
 ; CHECK: ; use a0
 
 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 1
+; CHECK: NumAgprs: 0
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
@@ -24,7 +24,7 @@ define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
 ; CHECK: ; use a0
 
 ; CHECK: NumVgprs: 0
-; CHECK: NumAgprs: 1
+; CHECK: NumAgprs: 0
 define void @func_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
index f4d17e50cf18c..494ade73cb5f8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -142,8 +142,8 @@ attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT:  - .registers:
-; SDAG-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
-; GISEL-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf01ca{{$}}
+; GFX8-NEXT:     '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf010a{{$}}
+; GFX9-NEXT:    '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf014a{{$}}
 ; GCN-NEXT:      '0x2e13 (COMPUTE_PGM_RSRC2)': 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
@@ -164,13 +164,13 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      multiple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x24{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x21{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x24{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x3{{$}}
 ; GCN-NEXT:      no_stack:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x20{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      no_stack_call:
@@ -203,7 +203,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_lds:
 ; GCN-NEXT:        .backend_stack_size: 0{{$}}
 ; GCN-NEXT:        .lds_size:       0x100{{$}}
-; GCN-NEXT:        .sgpr_count:     0x20{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x1{{$}}
 ; GCN-NEXT:      simple_lds_recurse:
@@ -215,7 +215,7 @@ attributes #0 = { nounwind }
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .backend_stack_size: 0x14{{$}}
 ; GCN-NEXT:        .lds_size:       0{{$}}
-; GCN-NEXT:        .sgpr_count:     0x21{{$}}
+; GCN-NEXT:        .sgpr_count:     0x1{{$}}
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:        .vgpr_count:     0x2{{$}}
 ; GCN-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index 9de6aea9385df..1e12c0bf9aa33 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -28,7 +28,7 @@ bb:
 }
 ; ALL:      .set .Laliasee_default.num_vgpr, 0
 ; ALL-NEXT: .set .Laliasee_default.num_agpr, 27
-; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 32
+; ALL-NEXT: .set .Laliasee_default.numbered_sgpr, 0
 
 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index f719f50ef6f13..ed4749d181ef9 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -18,7 +18,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_default_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_default_vgpr64_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index fe27859eb0afd..2ae4b1b69abfc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -24,7 +24,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr32_sgpr76.num_vgpr, 27
 ; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_vgpr32_sgpr76.numbered_sgpr, 0
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
   call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index 35b67351e85dd..334c764c06c0c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr64_sgpr102.num_vgpr, 53
 ; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 32
+; CHECK-NEXT: .set .Laliasee_vgpr64_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 3674d740b987b..f75f77f0fb6ba 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -21,7 +21,7 @@ bb:
 
 ; CHECK:      .set .Laliasee_vgpr256_sgpr102.num_vgpr, 253
 ; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.num_agpr, 0
-; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 33
+; CHECK-NEXT: .set .Laliasee_vgpr256_sgpr102.numbered_sgpr, 0
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index dbd00f09943c0..70574480d2113 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -7,7 +7,7 @@
 ; Make sure to run a GPU with the SGPR allocation bug.
 
 ; GCN-LABEL: {{^}}use_vcc:
-; GCN: ; TotalNumSgprs: 34
+; GCN: ; TotalNumSgprs: 2
 ; GCN: ; NumVgprs: 0
 define void @use_vcc() #1 {
   call void asm sideeffect "", "~{vcc}" () #0
@@ -43,8 +43,8 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 }
 
 ; GCN-LABEL: {{^}}use_flat_scratch:
-; CI: ; TotalNumSgprs: 36
-; VI: ; TotalNumSgprs: 38
+; CI: ; TotalNumSgprs: 4
+; VI: ; TotalNumSgprs: 6
 ; GCN: ; NumVgprs: 0
 define void @use_flat_scratch() #1 {
   call void asm sideeffect "", "~{flat_scratch}" () #0
diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
index 61830f18ad7a7..55dc394628176 100644
--- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -12,7 +12,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
-; CHECK: ; NumVgprs: 8
+; CHECK: ; NumVgprs: 5
 define amdgpu_kernel void @foobar(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index e152f2ddd5253..0a6aa05c2d212 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -5,14 +5,14 @@
 ; GCN-LABEL: {{^}}use_vcc:
 ; GCN: .set use_vcc.num_vgpr, 0
 ; GCN: .set use_vcc.num_agpr, 0
-; GCN: .set use_vcc.numbered_sgpr, 32
+; GCN: .set use_vcc.numbered_sgpr, 0
 ; GCN: .set use_vcc.private_seg_size, 0
 ; GCN: .set use_vcc.uses_vcc, 1
 ; GCN: .set use_vcc.uses_flat_scratch, 0
 ; GCN: .set use_vcc.has_dyn_sized_stack, 0
 ; GCN: .set use_vcc.has_recursion, 0
 ; GCN: .set use_vcc.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_vcc() #1 {
@@ -59,14 +59,14 @@ define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out)
 ; GCN-LABEL: {{^}}use_flat_scratch:
 ; GCN: .set use_flat_scratch.num_vgpr, 0
 ; GCN: .set use_flat_scratch.num_agpr, 0
-; GCN: .set use_flat_scratch.numbered_sgpr, 32
+; GCN: .set use_flat_scratch.numbered_sgpr, 0
 ; GCN: .set use_flat_scratch.private_seg_size, 0
 ; GCN: .set use_flat_scratch.uses_vcc, 0
 ; GCN: .set use_flat_scratch.uses_flat_scratch, 1
 ; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
 ; GCN: .set use_flat_scratch.has_recursion, 0
 ; GCN: .set use_flat_scratch.has_indirect_call, 0
-; GCN: TotalNumSgprs: 38
+; GCN: TotalNumSgprs: 6
 ; GCN: NumVgprs: 0
 ; GCN: ScratchSize: 0
 define void @use_flat_scratch() #1 {
@@ -113,14 +113,14 @@ define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace
 ; GCN-LABEL: {{^}}use_10_vgpr:
 ; GCN: .set use_10_vgpr.num_vgpr, 10
 ; GCN: .set use_10_vgpr.num_agpr, 0
-; GCN: .set use_10_vgpr.numbered_sgpr, 32
+; GCN: .set use_10_vgpr.numbered_sgpr, 0
 ; GCN: .set use_10_vgpr.private_seg_size, 0
 ; GCN: .set use_10_vgpr.uses_vcc, 0
 ; GCN: .set use_10_vgpr.uses_flat_scratch, 0
 ; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
 ; GCN: .set use_10_vgpr.has_recursion, 0
 ; GCN: .set use_10_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 10
 ; GCN: ScratchSize: 0
 define void @use_10_vgpr() #1 {
@@ -168,14 +168,14 @@ define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
 ; GCN-LABEL: {{^}}use_50_vgpr:
 ; GCN:	.set use_50_vgpr.num_vgpr, 50
 ; GCN:	.set use_50_vgpr.num_agpr, 0
-; GCN:	.set use_50_vgpr.numbered_sgpr, 32
+; GCN:	.set use_50_vgpr.numbered_sgpr, 0
 ; GCN:	.set use_50_vgpr.private_seg_size, 0
 ; GCN:	.set use_50_vgpr.uses_vcc, 0
 ; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
 ; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
 ; GCN:	.set use_50_vgpr.has_recursion, 0
 ; GCN:	.set use_50_vgpr.has_indirect_call, 0
-; GCN: TotalNumSgprs: 36
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 50
 ; GCN: ScratchSize: 0
 define void @use_50_vgpr() #1 {
@@ -258,14 +258,14 @@ define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
 ; GCN-LABEL: {{^}}use_stack0:
 ; GCN:	.set use_stack0.num_vgpr, 1
 ; GCN:	.set use_stack0.num_agpr, 0
-; GCN:	.set use_stack0.numbered_sgpr, 33
+; GCN:	.set use_stack0.numbered_sgpr, 0
 ; GCN:	.set use_stack0.private_seg_size, 2052
 ; GCN:	.set use_stack0.uses_vcc, 0
 ; GCN:	.set use_stack0.uses_flat_scratch, 0
 ; GCN:	.set use_stack0.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack0.has_recursion, 0
 ; GCN:	.set use_stack0.has_indirect_call, 0
-; GCN: TotalNumSgprs: 37
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 2052
 define void @use_stack0() #1 {
@@ -277,14 +277,14 @@ define void @use_stack0() #1 {
 ; GCN-LABEL: {{^}}use_stack1:
 ; GCN:	.set use_stack1.num_vgpr, 1
 ; GCN:	.set use_stack1.num_agpr, 0
-; GCN:	.set use_stack1.numbered_sgpr, 33
+; GCN:	.set use_stack1.numbered_sgpr, 0
 ; GCN:	.set use_stack1.private_seg_size, 404
 ; GCN:	.set use_stack1.uses_vcc, 0
 ; GCN:	.set use_stack1.uses_flat_scratch, 0
 ; GCN:	.set use_stack1.has_dyn_sized_stack, 0
 ; GCN:	.set use_stack1.has_recursion, 0
 ; GCN:	.set use_stack1.has_indirect_call, 0
-; GCN: TotalNumSgprs: 37
+; GCN: TotalNumSgprs: 4
 ; GCN: NumVgprs: 1
 ; GCN: ScratchSize: 404
 define void @use_stack1() #1 {
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 5a2a976e23846..724b4174350d4 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -63,7 +63,7 @@
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
-; ELF:   0250: 6770725F 636F756E 7402B12E 76677072
+; ELF:   0250: 6770725F 636F756E 7401B12E 76677072
 ; ELF:   0260: 5F737069 6C6C5F63 6F756E74 00AF2E77
 ; ELF:   0270: 61766566 726F6E74 5F73697A 6540AD61
 ; ELF:   0280: 6D646873 612E7461 72676574 BD616D64
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
index 5d7472fd3c56e..d467d6c0042b0 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -4,7 +4,7 @@
 
 ; Make sure that .vgpr_count doesn't include the %inactive.vgpr registers.
 ; CHECK-LABEL: leaf_shader:
-; CHECK: .vgpr_count:{{.*}}0xc{{$}}
+; CHECK: .vgpr_count:{{.*}}0x1{{$}}
 
 ; Function without calls.
 define amdgpu_cs_chain void @_leaf_shader(ptr %output.ptr, i32 inreg %input.value,
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 464cd820028cc..c3b033113431f 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -64,7 +64,7 @@ define void @func_regular_call() #1 {
 ; GCN-NEXT: s_addc_u32 s17,
 ; GCN-NEXT: s_setpc_b64 s[16:17]
 
-; GCN: ; TotalNumSgprs: 32
+; GCN: ; TotalNumSgprs: 18
 ; GCN: ; NumVgprs: 8
 define void @func_tail_call() #1 {
   tail call void @func()
diff --git a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
index 2db90a8883270..03694b913d6e0 100644
--- a/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
+++ b/llvm/test/CodeGen/AMDGPU/mcexpr-knownbits-assign-crash-gh-issue-110930.ll
@@ -24,7 +24,7 @@ define void @I_Quit() {
 ; CHECK-LABEL: P_RemoveMobj:
 ; CHECK: .set P_RemoveMobj.num_vgpr, 0
 ; CHECK: .set P_RemoveMobj.num_agpr, 0
-; CHECK: .set P_RemoveMobj.numbered_sgpr, 32
+; CHECK: .set P_RemoveMobj.numbered_sgpr, 0
 ; CHECK: .set P_RemoveMobj.private_seg_size, 0
 ; CHECK: .set P_RemoveMobj.uses_vcc, 0
 ; CHECK: .set P_RemoveMobj.uses_flat_scratch, 0
@@ -38,7 +38,7 @@ define void @P_RemoveMobj() {
 ; CHECK-LABEL: P_SpawnMobj:
 ; CHECK: .set P_SpawnMobj.num_vgpr, 0
 ; CHECK: .set P_SpawnMobj.num_agpr, 0
-; CHECK: .set P_SpawnMobj.numbered_sgpr, 32
+; CHECK: .set P_SpawnMobj.numbered_sgpr, 0
 ; CHECK: .set P_SpawnMobj.private_seg_size, 0
 ; CHECK: .set P_SpawnMobj.uses_vcc, 0
 ; CHECK: .set P_SpawnMobj.uses_flat_scratch, 0
@@ -52,7 +52,7 @@ define void @P_SpawnMobj() {
 ; CHECK-LABEL: G_PlayerReborn:
 ; CHECK: .set G_PlayerReborn.num_vgpr, 0
 ; CHECK: .set G_PlayerReborn.num_agpr, 0
-; CHECK: .set G_PlayerReborn.numbered_sgpr, 32
+; CHECK: .set G_PlayerReborn.numbered_sgpr, 0
 ; CHECK: .set G_PlayerReborn.private_seg_size, 0
 ; CHECK: .set G_PlayerReborn.uses_vcc, 0
 ; CHECK: .set G_PlayerReborn.uses_flat_scratch, 0
@@ -66,7 +66,7 @@ define void @G_PlayerReborn() {
 ; CHECK-LABEL: P_SetThingPosition:
 ; CHECK: .set P_SetThingPosition.num_vgpr, 0
 ; CHECK: .set P_SetThingPosition.num_agpr, 0
-; CHECK: .set P_SetThingPosition.numbered_sgpr, 32
+; CHECK: .set P_SetThingPosition.numbered_sgpr, 0
 ; CHECK: .set P_SetThingPosition.private_seg_size, 0
 ; CHECK: .set P_SetThingPosition.uses_vcc, 0
 ; CHECK: .set P_SetThingPosition.uses_flat_scratch, 0
@@ -96,7 +96,7 @@ define void @P_SetupPsprites(ptr addrspace(1) %i) {
 ; CHECK-LABEL: HU_Start:
 ; CHECK: .set HU_Start.num_vgpr, 0
 ; CHECK: .set HU_Start.num_agpr, 0
-; CHECK: .set HU_Start.numbered_sgpr, 32
+; CHECK: .set HU_Start.numbered_sgpr, 0
 ; CHECK: .set HU_Start.private_seg_size, 0
 ; CHECK: .set HU_Start.uses_vcc, 0
 ; CHECK: .set HU_Start.uses_flat_scratch, 0
@@ -162,7 +162,7 @@ define void @G_DoReborn() {
 ; CHECK-LABEL: AM_Stop:
 ; CHECK: .set AM_Stop.num_vgpr, 0
 ; CHECK: .set AM_Stop.num_agpr, 0
-; CHECK: .set AM_Stop.numbered_sgpr, 32
+; CHECK: .set AM_Stop.numbered_sgpr, 0
 ; CHECK: .set AM_Stop.private_seg_size, 0
 ; CHECK: .set AM_Stop.uses_vcc, 0
 ; CHECK: .set AM_Stop.uses_flat_scratch, 0
@@ -176,7 +176,7 @@ define void @AM_Stop() {
 ; CHECK-LABEL: D_AdvanceDemo:
 ; CHECK: .set D_AdvanceDemo.num_vgpr, 0
 ; CHECK: .set D_AdvanceDemo.num_agpr, 0
-; CHECK: .set D_AdvanceDemo.numbered_sgpr, 32
+; CHECK: .set D_AdvanceDemo.numbered_sgpr, 0
 ; CHECK: .set D_AdvanceDemo.private_seg_size, 0
 ; CHECK: .set D_AdvanceDemo.uses_vcc, 0
 ; CHECK: .set D_AdvanceDemo.uses_flat_scratch, 0
@@ -190,7 +190,7 @@ define void @D_AdvanceDemo() {
 ; CHECK-LABEL: F_StartFinale:
 ; CHECK: .set F_StartFinale.num_vgpr, 0
 ; CHECK: .set F_StartFinale.num_agpr, 0
-; CHECK: .set F_StartFinale.numbered_sgpr, 32
+; CHECK: .set F_StartFinale.numbered_sgpr, 0
 ; CHECK: .set F_StartFinale.private_seg_size, 0
 ; CHECK: .set F_StartFinale.uses_vcc, 0
 ; CHECK: .set F_StartFinale.uses_flat_scratch, 0
@@ -204,7 +204,7 @@ define void @F_StartFinale() {
 ; CHECK-LABEL: F_Ticker:
 ; CHECK: .set F_Ticker.num_vgpr, 0
 ; CHECK: .set F_Ticker.num_agpr, 0
-; CHECK: .set F_Ticker.numbered_sgpr, 32
+; CHECK: .set F_Ticker.numbered_sgpr, 0
 ; CHECK: .set F_Ticker.private_seg_size, 0
 ; CHECK: .set F_Ticker.uses_vcc, 0
 ; CHECK: .set F_Ticker.uses_flat_scratch, 0
@@ -236,7 +236,7 @@ define i32 @G_CheckDemoStatus() {
 ; CHECK-LABEL: P_TempSaveGameFile:
 ; CHECK: .set P_TempSaveGameFile.num_vgpr, 2
 ; CHECK: .set P_TempSaveGameFile.num_agpr, 0
-; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 32
+; CHECK: .set P_TempSaveGameFile.numbered_sgpr, 0
 ; CHECK: .set P_TempSaveGameFile.private_seg_size, 0
 ; CHECK: .set P_TempSaveGameFile.uses_vcc, 0
 ; CHECK: .set P_TempSaveGameFile.uses_flat_scratch, 0
@@ -250,7 +250,7 @@ define ptr @P_TempSaveGameFile() {
 ; CHECK-LABEL: P_SaveGameFile:
 ; CHECK: .set P_SaveGameFile.num_vgpr, 2
 ; CHECK: .set P_SaveGameFile.num_agpr, 0
-; CHECK: .set P_SaveGameFile.numbered_sgpr, 32
+; CHECK: .set P_SaveGameFile.numbered_sgpr, 0
 ; CHECK: .set P_SaveGameFile.private_seg_size, 0
 ; CHECK: .set P_SaveGameFile.uses_vcc, 0
 ; CHECK: .set P_SaveGameFile.uses_flat_scratch, 0
diff --git a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
index 7a810d0067c17..83f58db1aa67f 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-call-resource-usage-mcexpr.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, 13
 ; CHECK: .set qux.num_agpr, 0
-; CHECK: .set qux.numbered_sgpr, 32
+; CHECK: .set qux.numbered_sgpr, 0
 ; CHECK: .set qux.private_seg_size, 0
 ; CHECK: .set qux.uses_vcc, 0
 ; CHECK: .set qux.uses_flat_scratch, 0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
index 638dc8965987e..28c3131302a31 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -83,13 +83,13 @@
 ; CHECK-NEXT:      multiple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x24
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x24
 ; CHECK-NEXT:        .vgpr_count:     0x3
 ; CHECK-NEXT:      no_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      no_stack_call:
@@ -122,7 +122,7 @@
 ; CHECK-NEXT:      simple_lds:
 ; CHECK-NEXT:        .backend_stack_size: 0
 ; CHECK-NEXT:        .lds_size:       0x100
-; CHECK-NEXT:        .sgpr_count:     0x20
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0
 ; CHECK-NEXT:        .vgpr_count:     0x1
 ; CHECK-NEXT:      simple_lds_recurse:
@@ -134,7 +134,7 @@
 ; CHECK-NEXT:      simple_stack:
 ; CHECK-NEXT:        .backend_stack_size: 0x14
 ; CHECK-NEXT:        .lds_size:       0
-; CHECK-NEXT:        .sgpr_count:     0x21
+; CHECK-NEXT:        .sgpr_count:     0x1
 ; CHECK-NEXT:        .stack_frame_size_in_bytes: 0x14
 ; CHECK-NEXT:        .vgpr_count:     0x2
 ; CHECK-NEXT:      simple_stack_call:
diff --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index 5b9b0feea9900..a71fd7fe782ff 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
 
 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
-; ;CHECK: NumVgprs: 4
+; ;CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   %i1 = extractelement <2 x float> %arg3, i32 1
@@ -193,7 +193,7 @@ define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float>
 
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
-; CHECK: NumVgprs: 4
+; CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
 .entry:
   ret { <4 x float> } undef
@@ -202,7 +202,7 @@ define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg
 ; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
 ; Additionally set the PSInputAddr to 0 via the metadata
 ; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
-; CHECK: NumVgprs: 4
+; CHECK: NumVgprs: 2
 define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
 .entry:
   ret { <4 x float> } undef
diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
index 35e11ad6a648b..bfcf90037bfd3 100644
--- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -24,7 +24,9 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias %out, ptr addrspace(1) %
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) nounwind {
+define amdgpu_kernel void @one_vgpr_used(ptr addrspace(1) %out, i32 %x) #0 {
   store i32 %x, ptr addrspace(1) %out, align 4
   ret void
 }
+
+attributes #0 = { nounwind noinline "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
index c5732531f5423..95d707aee5662 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll
@@ -11,8 +11,8 @@
 ; allow scheduling of other instructions which reduce RP
 
 ; CHECK-LABEL: {{^}}return_72xi32:
-; GFX11-PAL:    NumSgprs: 33
-; GFX11-PAL-GCNTRACKERS:    NumSgprs: 33
+; GFX11-PAL:    NumSgprs: 0
+; GFX11-PAL-GCNTRACKERS:    NumSgprs: 0
 ; GFX11-PAL:    NumVgprs: 64
 ; GFX11-PAL-GCNTRACKERS:    NumVgprs: 64
 ; GFX11-PAL:    ScratchSize: 220
diff --git a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
index bb0ec0d3ad3f8..4802ec861d685 100644
--- a/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/unnamed-function-resource-info.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: __unnamed_1:
 ; CHECK: .set __unnamed_1.num_vgpr, 0
 ; CHECK: .set __unnamed_1.num_agpr, 0
-; CHECK: .set __unnamed_1.numbered_sgpr, 32
+; CHECK: .set __unnamed_1.numbered_sgpr, 0
 ; CHECK: .set __unnamed_1.private_seg_size, 0
 ; CHECK: .set __unnamed_1.uses_vcc, 0
 ; CHECK: .set __unnamed_1.uses_flat_scratch, 0

>From 563cbd6f1de764af8eb65ee56edf6a11a949ed08 Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Mon, 5 May 2025 16:52:50 +0200
Subject: [PATCH 7/9] Add missing tests. Fix preloaded VGPR issue

---
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 15 ++++----
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 13 +++++++
 .../test/CodeGen/AMDGPU/vgpr-count-compute.ll | 30 ++++++++++++++++
 .../CodeGen/AMDGPU/vgpr-count-graphics.ll     | 35 +++++++++++++++++++
 4 files changed, 87 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 942c3ccd13f0c..666187c01c41a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -146,12 +146,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
 
   // Count any user or system SGPRs that are actually used.
-  for (int I = MFI->getNumPreloadedSGPRs() - 1; I >= 0; I--)
-    if (MRI.isPhysRegUsed(AMDGPU::SGPR0 + I)) {
-      Info.NumExplicitSGPR = std::max<int32_t>(
-          Info.NumExplicitSGPR, TRI.getHWRegIndex(AMDGPU::SGPR0 + I) + 1);
-      break;
-    }
+  auto GetNumUsedPreloadedRegs = [&](int NumPreloadedRegs, unsigned RegBase) -> int32_t {
+    for (int I = NumPreloadedRegs - 1; I >= 0; I--)
+      if (MRI.isPhysRegUsed(RegBase + I))
+        return TRI.getHWRegIndex(RegBase + I) + 1;
+    return 0;
+  };
+
+  Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, GetNumUsedPreloadedRegs(MFI->getNumPreloadedSGPRs(), AMDGPU::SGPR0));
+  Info.NumVGPR = std::max(Info.NumVGPR, GetNumUsedPreloadedRegs(MFI->getNumPreloadedVGPRs(), AMDGPU::VGPR0));
 
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
     return Info;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0e7635a045588..be3fe1a0bced4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -974,6 +974,19 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return NumUserSGPRs + NumSystemSGPRs;
   }
 
+  unsigned getNumPreloadedVGPRs() const {
+    if (hasWorkItemIDZ())
+      return ArgInfo.WorkItemIDZ.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDY())
+      return ArgInfo.WorkItemIDY.getRegister() - AMDGPU::VGPR0 + 1;
+
+    if (hasWorkItemIDX())
+      return ArgInfo.WorkItemIDX.getRegister() - AMDGPU::VGPR0 + 1;
+
+    return 0;
+  }
+
   unsigned getNumKernargPreloadedSGPRs() const {
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
new file mode 100644
index 0000000000000..8c8182db7b479
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-compute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s --check-prefixes=CHECK,PACKED
+; RUN: llc -mcpu=gfx1030 -o - < %s | FileCheck %s --check-prefixes=CHECK,NOTPACKED
+target triple = "amdgcn-amd-amdhsa"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; Carefully crafted kernel that uses v0 but never writes a VGPR or reads another VGPR.
+; Only hardware-initialized VGPRs (v0) are read in this kernel.
+
+; CHECK-LABEL: amdhsa.kernels:
+; CHECK-LABEL: kernel_x
+; CHECK: .vgpr_count:     1
+define amdgpu_kernel void @kernel_x(ptr addrspace(8) %rsrc) #0 {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+; CHECK-LABEL: kernel_z
+; PACKED: .vgpr_count:     1
+; NOTPACKED: .vgpr_count:     3
+define amdgpu_kernel void @kernel_z(ptr addrspace(8) %rsrc) {
+entry:
+  %id = call i32 @llvm.amdgcn.workitem.id.z()
+  call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %id, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0)
+  ret void
+}
+
+attributes #0 = { "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
new file mode 100644
index 0000000000000..f5d28a0ae1628
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-count-graphics.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mcpu=gfx1200 -o - < %s | FileCheck %s
+; Check that reads of a VGPR in kernels counts towards VGPR count, but in functions, only writes of VGPRs count towards VGPR count.
+target triple = "amdgcn--amdpal"
+
+ at global = addrspace(1) global i32 poison, align 4
+
+; CHECK-LABEL: amdpal.pipelines:
+
+; Neither uses not writes a VGPR, but the hardware initializes the VGPRs that the kernel receives, so they count as used.
+; CHECK-LABEL: .entry_point_symbol: kernel_use
+; CHECK: .vgpr_count:     0x20
+define amdgpu_cs void @kernel_use([32 x i32] %args) {
+entry:
+  %a = extractvalue [32 x i32] %args, 14
+  store i32 %a, ptr addrspace(1) @global
+  ret void
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: chain_func:
+; CHECK: .vgpr_count:     0x1
+define amdgpu_cs_chain void @chain_func([32 x i32] %args) {
+entry:
+  call void (ptr, i32, {}, [32 x i32], i32, ...) @llvm.amdgcn.cs.chain.p0.i32.s.a(
+        ptr @chain_func, i32 0, {} inreg {}, [32 x i32] %args, i32 0)
+  unreachable
+}
+
+; Neither uses not writes a VGPR
+; CHECK-LABEL: gfx_func:
+; CHECK: .vgpr_count:     0x1
+define amdgpu_gfx [32 x i32] @gfx_func([32 x i32] %args) {
+entry:
+  ret [32 x i32] %args
+}

>From f27621db0efefcc10ebd1c22d4287c8172fcc63a Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Mon, 5 May 2025 17:03:00 +0200
Subject: [PATCH 8/9] Formatting

---
 .../lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 666187c01c41a..c166ec2452f87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -146,15 +146,20 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
     Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
 
   // Count any user or system SGPRs that are actually used.
-  auto GetNumUsedPreloadedRegs = [&](int NumPreloadedRegs, unsigned RegBase) -> int32_t {
+  auto GetNumUsedPreloadedRegs = [&](int NumPreloadedRegs,
+                                     unsigned RegBase) -> int32_t {
     for (int I = NumPreloadedRegs - 1; I >= 0; I--)
       if (MRI.isPhysRegUsed(RegBase + I))
         return TRI.getHWRegIndex(RegBase + I) + 1;
     return 0;
   };
 
-  Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, GetNumUsedPreloadedRegs(MFI->getNumPreloadedSGPRs(), AMDGPU::SGPR0));
-  Info.NumVGPR = std::max(Info.NumVGPR, GetNumUsedPreloadedRegs(MFI->getNumPreloadedVGPRs(), AMDGPU::VGPR0));
+  Info.NumExplicitSGPR = std::max(
+      Info.NumExplicitSGPR,
+      GetNumUsedPreloadedRegs(MFI->getNumPreloadedSGPRs(), AMDGPU::SGPR0));
+  Info.NumVGPR = std::max(
+      Info.NumVGPR,
+      GetNumUsedPreloadedRegs(MFI->getNumPreloadedVGPRs(), AMDGPU::VGPR0));
 
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
     return Info;

>From ca7e723a9d66ef2b3028a7d9a2b5567d981ffb4c Mon Sep 17 00:00:00 2001
From: Diana Picus <diana-magda.picus at amd.com>
Date: Tue, 6 May 2025 13:32:37 +0200
Subject: [PATCH 9/9] Cound all preloaded regs

---
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    |  22 ++--
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  15 ++-
 .../AMDGPU/GlobalISel/extractelement.ll       |  38 +++----
 .../AMDGPU/amdgpu-no-agprs-violations.ll      |   3 +-
 .../amdhsa-kernarg-preload-num-sgprs.ll       |  28 ++---
 llvm/test/CodeGen/AMDGPU/amdpal-elf.ll        |  16 ++-
 .../attr-amdgpu-flat-work-group-size.ll       |  12 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |  24 ++--
 .../AMDGPU/call-graph-register-usage.ll       |   4 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |   6 +-
 llvm/test/CodeGen/AMDGPU/elf-notes.ll         |   2 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  | 106 +++++++++---------
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |   2 +-
 .../init-whole-wave-vgpr-count-large.ll       |   4 -
 .../AMDGPU/init-whole-wave-vgpr-count-leaf.ll |   4 -
 ...init-whole-wave-vgpr-count-use-inactive.ll |   4 -
 .../AMDGPU/init-whole-wave-vgpr-count.ll      |   4 -
 .../AMDGPU/resource-optimization-remarks.ll   |   4 +-
 .../AMDGPU/schedule-amdgpu-tracker-physreg.ll |   4 +-
 .../AMDGPU/schedule-regpressure-limit2.ll     |   6 +-
 .../CodeGen/AMDGPU/stack-realign-kernel.ll    |  12 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   4 +-
 24 files changed, 158 insertions(+), 174 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index c166ec2452f87..7bde59412d905 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -145,21 +145,13 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   if (ST.hasMAIInsts())
     Info.NumAGPR = TRI.getNumDefinedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
 
-  // Count any user or system SGPRs that are actually used.
-  auto GetNumUsedPreloadedRegs = [&](int NumPreloadedRegs,
-                                     unsigned RegBase) -> int32_t {
-    for (int I = NumPreloadedRegs - 1; I >= 0; I--)
-      if (MRI.isPhysRegUsed(RegBase + I))
-        return TRI.getHWRegIndex(RegBase + I) + 1;
-    return 0;
-  };
-
-  Info.NumExplicitSGPR = std::max(
-      Info.NumExplicitSGPR,
-      GetNumUsedPreloadedRegs(MFI->getNumPreloadedSGPRs(), AMDGPU::SGPR0));
-  Info.NumVGPR = std::max(
-      Info.NumVGPR,
-      GetNumUsedPreloadedRegs(MFI->getNumPreloadedVGPRs(), AMDGPU::VGPR0));
+  // Preloaded registers are written by the hardware, not defined in the
+  // function body, so they need special handling.
+  if (MFI->isEntryFunction()) {
+    Info.NumExplicitSGPR =
+        std::max<int32_t>(Info.NumExplicitSGPR, MFI->getNumPreloadedSGPRs());
+    Info.NumVGPR = std::max<int32_t>(Info.NumVGPR, MFI->getNumPreloadedVGPRs());
+  }
 
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall())
     return Info;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9be0770f866e7..1108683ec932f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -4056,15 +4056,14 @@ SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
 unsigned
 SIRegisterInfo::getNumDefinedPhysRegs(const MachineRegisterInfo &MRI,
                                       const TargetRegisterClass &RC) const {
-  auto isDefinedByImplicitDef = [](MachineOperand &Op) {
-    return Op.getParent()->isImplicitDef();
-  };
-
-  for (MCPhysReg Reg : reverse(RC.getRegisters()))
-    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
-      if (!(MRI.def_empty(*AI) || std::all_of(MRI.def_begin(*AI), MRI.def_end(),
-                                              isDefinedByImplicitDef)))
+  for (MCPhysReg Reg : reverse(RC.getRegisters())) {
+    for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) {
+      if (!std::all_of(
+              MRI.def_instr_begin(*AI), MRI.def_instr_end(),
+              [](const MachineInstr &MI) { return MI.isImplicitDef(); }))
         return getHWRegIndex(Reg) + 1;
+    }
+  }
   return 0;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 9ef16aef0dd16..c79e96f8d5dec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 17
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3202,7 +3202,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3245,7 +3245,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     wavefront_sgpr_count = 18
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -3294,7 +3294,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3337,7 +3337,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4034,7 +4034,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4077,8 +4077,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 16
-; GPRIDX-NEXT:     workitem_vgpr_count = 2
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
+; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4206,7 +4206,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4249,8 +4249,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
-; GFX10-NEXT:     workitem_vgpr_count = 2
+; GFX10-NEXT:     wavefront_sgpr_count = 18
+; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4291,7 +4291,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4334,7 +4334,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 6
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 2
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
@@ -4382,7 +4382,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4425,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 16
+; GPRIDX-NEXT:     wavefront_sgpr_count = 24
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = 2
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4603,7 +4603,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 10
+; GFX10-NEXT:     wavefront_sgpr_count = 18
 ; GFX10-NEXT:     workitem_vgpr_count = 3
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
 ; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = 1
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4691,7 +4691,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
+; GFX11-NEXT:     wavefront_sgpr_count = 16
 ; GFX11-NEXT:     workitem_vgpr_count = 3
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
index 44308e353680b..cc614bb24839c 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll
@@ -13,7 +13,8 @@
 ; CHECK: {{^}}kernel_illegal_agpr_use_asm:
 ; CHECK: ; use a0
 
-; CHECK: NumVgprs: 0
+; GFX908: NumVgprs: 3
+; GFX90A: NumVgprs: 1
 ; CHECK: NumAgprs: 0
 define amdgpu_kernel void @kernel_illegal_agpr_use_asm() #0 {
   call void asm sideeffect "; use $0", "a"(i32 poison)
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
index dd760c2a215ca..7851de641c5a3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll
@@ -10,9 +10,9 @@
 
 ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6:
 ; ASM: .amdhsa_user_sgpr_count 12
-; ASM: .amdhsa_next_free_sgpr 12
-; ASM: ; TotalNumSgprs: 18
-; ASM: ; NumSGPRsForWavesPerEU: 18
+; ASM: .amdhsa_next_free_sgpr 15
+; ASM: ; TotalNumSgprs: 21
+; ASM: ; NumSGPRsForWavesPerEU: 21
 
 ; Test that we include preloaded SGPRs in the GRANULATED_WAVEFRONT_SGPR_COUNT
 ; feild that are not explicitly referenced in the kernel. This test has 6 implicit
@@ -26,13 +26,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_4_implicit_6(i128 inreg) { ret
 ; OBJDUMP-NEXT: 0040 00000000 00000000 20010000 00000000  ........ .......
 ; OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 0060 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 0070 4000af00 94000000 08000800 00000000  @...............
+; OBJDUMP-NEXT: 0070 8000af00 94000000 08000800 00000000  ................
 
 ; ASM-LABEL: amdhsa_kernarg_preload_8_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 10
-; ASM: .amdhsa_next_free_sgpr 10
-; ASM: ; TotalNumSgprs: 16
-; ASM: ; NumSGPRsForWavesPerEU: 16
+; ASM: .amdhsa_next_free_sgpr 11
+; ASM: ; TotalNumSgprs: 17
+; ASM: ; NumSGPRsForWavesPerEU: 17
 
 ; Only the kernarg_ptr is enabled so we should have 8 preload kernarg SGPRs, 2
 ; implicit, and 6 extra.
@@ -46,9 +46,9 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_8_implicit_2(i256 inreg) #0 {
 
 ; ASM-LABEL: amdhsa_kernarg_preload_1_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 3
-; ASM: .amdhsa_next_free_sgpr 3
-; ASM: ; TotalNumSgprs: 9
-; ASM: ; NumSGPRsForWavesPerEU: 9
+; ASM: .amdhsa_next_free_sgpr 4
+; ASM: ; TotalNumSgprs: 10
+; ASM: ; NumSGPRsForWavesPerEU: 10
 
 ; 1 preload, 2 implicit, 6 extra. Rounds up to 16 SGPRs in the KD.
 
@@ -57,13 +57,13 @@ define amdgpu_kernel void @amdhsa_kernarg_preload_1_implicit_2(i32 inreg) #0 { r
 ; OBJDUMP-NEXT: 00c0 00000000 00000000 08010000 00000000  ................
 ; OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000  ................
 ; OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000  ................
-; OBJDUMP-NEXT: 00f0 0000af00 84000000 08000000 00000000  ................
+; OBJDUMP-NEXT: 00f0 4000af00 84000000 08000000 00000000  @...............
 
 ; ASM-LABEL: amdhsa_kernarg_preload_0_implicit_2:
 ; ASM: .amdhsa_user_sgpr_count 2
-; ASM: .amdhsa_next_free_sgpr 0
-; ASM: ; TotalNumSgprs: 6
-; ASM: ; NumSGPRsForWavesPerEU: 6
+; ASM: .amdhsa_next_free_sgpr 3
+; ASM: ; TotalNumSgprs: 9
+; ASM: ; NumSGPRsForWavesPerEU: 9
 
 ; 0 preload kernarg SGPRs, 2 implicit, 6 extra. Rounds up to 8 SGPRs in the KD.
 ; Encoded like '00'.
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
index f52ba7000edeb..5ccf41c408b72 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-elf.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdpal -mcpu=kaveri | llvm-readobj -S --sd --syms - | FileCheck %s --check-prefix=ELF
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX10 %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX10 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize32 | FileCheck --check-prefix=GFX11W32 %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=+wavefrontsize64 | FileCheck --check-prefix=GFX11W64 %s
 
 ; ELF: Section {
 ; ELF: Name: .text
@@ -23,8 +23,16 @@
 ; ELF: Section: .text (0x2)
 ; ELF: }
 
-; GFX10: NumSGPRsForWavesPerEU: 6
-; GFX10: NumVGPRsForWavesPerEU: 1
+; GFX10: NumSGPRsForWavesPerEU: 12
+; GFX10: NumVGPRsForWavesPerEU: 3
+
+; Wave32 and 64 behave differently due to the UserSGPRInit16Bug,
+; which only affects Wave32.
+; GFX11W32: NumSGPRsForWavesPerEU: 16
+; GFX11W32: NumVGPRsForWavesPerEU: 1
+
+; GFX11W64: NumSGPRsForWavesPerEU: 11
+; GFX11W64: NumVGPRsForWavesPerEU: 1
 
 define amdgpu_kernel void @simple(ptr addrspace(1) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 22cc5af30da66..4d5e56efd1ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=HSAMD %s
 
 ; CHECK-LABEL: {{^}}min_64_max_64:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
@@ -13,10 +13,10 @@ entry:
 attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
 
 ; CHECK-LABEL: {{^}}min_64_max_128:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
@@ -24,10 +24,10 @@ entry:
 attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
 
 ; CHECK-LABEL: {{^}}min_128_max_128:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @min_128_max_128() #2 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4507fd5865989..d08184aa5cd5e 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -26,10 +26,10 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_exactly_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
@@ -38,10 +38,10 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
 
 ; At least 1 wave per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_1:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
@@ -50,10 +50,10 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
 
 ; At least 5 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_5:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
@@ -62,10 +62,10 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
 
 ; At least 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_least_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
@@ -88,10 +88,10 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5" "amdgpu-flat-work-group-size"="1,64
 
 ; At most 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_at_most_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
@@ -102,10 +102,10 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
 
 ; Between 5 and 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}empty_between_5_and_10:
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
-; CHECK: NumVGPRsForWavesPerEU: 1
+; CHECK: NumVGPRsForWavesPerEU: 3
 define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 70574480d2113..db1269e8e95c2 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -234,7 +234,7 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr96_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr96_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
@@ -249,7 +249,7 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, amdgpu.max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(3, amdgpu.max_num_vgpr)
 ; GCN: .set count_use_sgpr160_external_call.numbered_sgpr, max(33, amdgpu.max_num_sgpr)
 ; CI: TotalNumSgprs: count_use_sgpr160_external_call.numbered_sgpr+4
 ; VI-BUG: TotalNumSgprs: 96
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3fe3cafd729a7..d8d7494d0dc1c 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -16,7 +16,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -35,7 +35,7 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@@ -93,7 +93,7 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 1
+; ALL-ASM:     .amdhsa_next_free_vgpr 3
 ; ALL-ASM:     .amdhsa_next_free_sgpr 1
 define amdgpu_kernel void @empty(
     i32 %i,
diff --git a/llvm/test/CodeGen/AMDGPU/elf-notes.ll b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
index 22d699a8f4809..59cf9825116fa 100644
--- a/llvm/test/CodeGen/AMDGPU/elf-notes.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf-notes.ll
@@ -43,7 +43,7 @@
 ; OSABI-HSA-ELF:     .sgpr_count:     96
 ; OSABI-HSA-ELF:     .sgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .symbol:         elf_notes.kd
-; OSABI-HSA-ELF:     .vgpr_count:     0
+; OSABI-HSA-ELF:     .vgpr_count:     1
 ; OSABI-HSA-ELF:     .vgpr_spill_count: 0
 ; OSABI-HSA-ELF:     .wavefront_size: 64
 ; OSABI-HSA-ELF: amdhsa.target:   amdgcn-amd-amdhsa--gfx802
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index a59382ba20dc5..ed1f3e1397abc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -27,15 +27,15 @@
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 8
-; VI-NOXNACK: ; TotalNumSgprs: 8
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
-; VI-XNACK: ; TotalNumSgprs: 12
-; HSA-VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
+; CI: ; TotalNumSgprs: 12
+; VI-NOXNACK: ; TotalNumSgprs: 12
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 18
+; VI-XNACK: ; TotalNumSgprs: 16
+; HSA-VI-XNACK: ; TotalNumSgprs: 22
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -50,15 +50,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 10
-; VI-NOXNACK: ; TotalNumSgprs: 10
-; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
-; VI-XNACK: ; TotalNumSgprs: 12
-; HSA-VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
+; CI: ; TotalNumSgprs: 14
+; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 20
+; VI-XNACK: ; TotalNumSgprs: 16
+; HSA-VI-XNACK: ; TotalNumSgprs: 22
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -73,15 +73,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
+; CI: ; TotalNumSgprs: 16
+; VI-NOXNACK: ; TotalNumSgprs: 18
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 18
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -96,15 +96,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: ; TotalNumSgprs: 12
-; VI-NOXNACK: ; TotalNumSgprs: 14
+; CI: ; TotalNumSgprs: 16
+; VI-NOXNACK: ; TotalNumSgprs: 18
 ; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
-; VI-XNACK: ; TotalNumSgprs: 14
+; VI-XNACK: ; TotalNumSgprs: 18
 ; HSA-VI-XNACK: ; TotalNumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 13
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 13
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -122,15 +122,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -143,15 +143,15 @@ entry:
 ; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -166,15 +166,15 @@ entry:
 ; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 ; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
 
-; CI: NumSgprs: 4
-; VI-NOXNACK: NumSgprs: 6
+; CI: NumSgprs: 16
+; VI-NOXNACK: NumSgprs: 18
 ; HSA-VI-NOXNACK: NumSgprs: 24
-; VI-XNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 18
 ; HSA-VI-XNACK: NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 17
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 17
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 11
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 11
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index cd89a36fe538b..bf452a9e38e01 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     10
+; CHECK:     .sgpr_count:     16
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
index e47f5e25ead3a..45de8a79fe88d 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-large.ll
@@ -70,7 +70,3 @@ declare i1 @llvm.amdgcn.init.whole.wave()
 declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
 
 declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
-
-!amdgpu.pal.metadata.msgpack = !{!0}
-
-!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
index d467d6c0042b0..9c636d4516a80 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-leaf.ll
@@ -44,7 +44,3 @@ declare i1 @llvm.amdgcn.init.whole.wave()
 declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
 
 declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
-
-!amdgpu.pal.metadata.msgpack = !{!0}
-
-!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
index f1f7fb22d44c6..1b0d33cec7052 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count-use-inactive.ll
@@ -72,7 +72,3 @@ declare i1 @llvm.amdgcn.init.whole.wave()
 declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
 
 declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
-
-!amdgpu.pal.metadata.msgpack = !{!0}
-
-!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
index b9130dd1b7ed4..9408501718784 100644
--- a/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/init-whole-wave-vgpr-count.ll
@@ -69,7 +69,3 @@ declare i1 @llvm.amdgcn.init.whole.wave()
 declare void @llvm.amdgcn.cs.chain.p0.i32.v4i32.sl_i32i32i32i32i32i32i32i32i32i32i32i32s(ptr, i32, <4 x i32>, { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }, i32 immarg, ...)
 
 declare amdgpu_cs_chain void @retry_vgpr_alloc.v4i32(<4 x i32> inreg)
-
-!amdgpu.pal.metadata.msgpack = !{!0}
-
-!0 = !{!"\82\B0amdpal.pipelines\91\8B\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C3\AA.tgid_y_en\C3\AA.tgid_z_en\C3\AF.tidig_comp_cnt\00\B0.hardware_stages\81\A3.cs\8D\AF.checksum_value\00\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93 \01\01\AD.trap_present\00\B2.user_data_reg_map\90\AB.user_sgprs\10\AB.vgpr_limit\CD\01\00\AF.wavefront_size \AF.wg_round_robin\C2\B7.internal_pipeline_hash\92\CF|{2&\DCC\85M\CFep\8A\EDR\DE\D6\E1\B1.shader_functions\81\A7_miss_1\82\B4.frontend_stack_size\00\B4.outgoing_vgpr_countP\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\00\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CD\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\A9.uses_cps\C3\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4\AF\9D\0B\07\88\03\02\CF\01o\C9\CAf?)\DA\AD.llpc_version\A476.0\AEamdpal.version\92\03\00"}
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index afb77ed190896..a2470a60cb19f 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -122,8 +122,8 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 4
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
+; STDERR-NEXT: remark: foo.cl:8:0:     TotalSGPRs: 22
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 3
 ; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
index 0d25bc97ff775..557ffd27a07f6 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-tracker-physreg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: {{^}}spill:
 ; GCN:    NumSgprs: 104
 ; GCN-GCNTRACKERS:    NumSgprs: 104
-; GCN:    NumVgprs: 1
-; GCN-GCNTRACKERS:    NumVgprs: 2
+; GCN:    NumVgprs: 3
+; GCN-GCNTRACKERS:    NumVgprs: 3
 ; GCN:    ScratchSize: 0
 ; GCN-GCNTRACKERS:    ScratchSize: 0
 ; GCN:    Occupancy: 5
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
index 462ac23ec7e0e..8300a52955b91 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -7,14 +7,14 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MINREG %s
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -amdgpu-sched-strategy=iterative-maxocc -verify-machineinstrs < %s | FileCheck --check-prefix=VI-MAXOCC %s
 
-; SI-MINREG: NumSgprs: {{[1-9]$}}
-; SI-MINREG: NumVgprs: {{[1-9]$}}
+; SI-MINREG: NumSgprs: {{[1]?[1-9]$}}
+; SI-MINREG: NumVgprs: {{[1]?[1-9]$}}
 
 ; SI-MAXOCC: NumSgprs: {{[1-4]?[0-9]$}}
 ; SI-MAXOCC: NumVgprs: {{[1-4]?[0-9]$}}
 
 ; stores may alias loads
-; VI-MINREG: NumSgprs: {{[0-9]$}}
+; VI-MINREG: NumSgprs: {{[1]?[0-9]$}}
 ; VI-MINREG: NumVgprs: {{[1-3][0-9]$}}
 
 ; stores may alias loads
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 6ddf0986755f9..682bbdedb37a3 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -86,7 +86,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -146,7 +146,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -197,7 +197,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -257,7 +257,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
+; VI-NEXT:     .amdhsa_next_free_vgpr 3
 ; VI-NEXT:     .amdhsa_next_free_sgpr 18
 ; VI-NEXT:     .amdhsa_reserve_vcc 0
 ; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
@@ -308,7 +308,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
+; GFX9-NEXT:     .amdhsa_next_free_vgpr 3
 ; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
 ; GFX9-NEXT:     .amdhsa_reserve_vcc 0
 ; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 30accc846d2b6..d3def45c4f9d2 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     9
+; ELF: .sgpr_count:     12
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 4f84b31f1877b..ad831e040d722 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 0
 
 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     5
+; ELF: .sgpr_count:     8
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 644f434923368..d1e28e11601ce 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,7 +6,7 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr 8
 ; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
@@ -17,7 +17,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
-; ELF: .sgpr_count:     9
+; ELF: .sgpr_count:     12
 entry:
   tail call void asm sideeffect "", "~{s[0:4]}"()
   ret void