[llvm] a910a6a - [AMDGPU] AsmPrinter: Unify arg handling (#151672)

Fri Aug 8 03:00:40 PDT 2025

Author: Diana Picus
Date: 2025-08-08T12:00:37+02:00
New Revision: a910a6a8b5f96aa247f079016c9553a0aa445ba4

URL: https://github.com/llvm/llvm-project/commit/a910a6a8b5f96aa247f079016c9553a0aa445ba4
DIFF: https://github.com/llvm/llvm-project/commit/a910a6a8b5f96aa247f079016c9553a0aa445ba4.diff

LOG: [AMDGPU] AsmPrinter: Unify arg handling (#151672)

When computing the number of registers required by entry functions, the
`AMDGPUAsmPrinter` needs to take into account both the register usage
computed by the `AMDGPUResourceUsageAnalysis` pass, and the number
of registers initialized by the hardware. At the moment, the way it
computes the latter is different for graphics vs compute, due to differences in
the implementation. For kernels, all the information needed is available in
the `SIMachineFunctionInfo`, but for graphics shaders we would iterate over
the `Function` arguments in the `AMDGPUAsmPrinter`. This pretty much
repeats some of the logic from instruction selection.

This patch introduces 2 new members to `SIMachineFunctionInfo`, one
for SGPRs and one for VGPRs. Both will be computed during instruction
selection and then used during `AMDGPUAsmPrinter`, removing the need
to refer to the `Function` when printing assembly.

This patch is NFC except for the fact that we now add the extra SGPRs
(VCC, XNACK etc) to the number of SGPRs computed for graphics entry points.
I'm not sure why these weren't included before. It would be nice if
someone could confirm if that was just an oversight or if we have some docs
somewhere that I haven't managed to find. Only one test is affected (its SGPR
usage increases because we now take into account the XNACK registers).

Added: 
    llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
    llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
    llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
    llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2a324e5683910..626734a4752f3 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -997,89 +997,24 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   const Function &F = MF.getFunction();
 
   // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
-  // dispatch registers are function args.
-  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
-
-  if (isShader(F.getCallingConv())) {
-    bool IsPixelShader =
-        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
-
-    // Calculate the number of VGPR registers based on the SPI input registers
-    uint32_t InputEna = 0;
-    uint32_t InputAddr = 0;
-    unsigned LastEna = 0;
-
-    if (IsPixelShader) {
-      // Note for IsPixelShader:
-      // By this stage, all enabled inputs are tagged in InputAddr as well.
-      // We will use InputAddr to determine whether the input counts against the
-      // vgpr total and only use the InputEnable to determine the last input
-      // that is relevant - if extra arguments are used, then we have to honour
-      // the InputAddr for any intermediate non-enabled inputs.
-      InputEna = MFI->getPSInputEnable();
-      InputAddr = MFI->getPSInputAddr();
-
-      // We only need to consider input args up to the last used arg.
-      assert((InputEna || InputAddr) &&
-             "PSInputAddr and PSInputEnable should "
-             "never both be 0 for AMDGPU_PS shaders");
-      // There are some rare circumstances where InputAddr is non-zero and
-      // InputEna can be set to 0. In this case we default to setting LastEna
-      // to 1.
-      LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
-    }
+  // dispatch registers as function args.
+  unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
+           WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
 
-    // FIXME: We should be using the number of registers determined during
-    // calling convention lowering to legalize the types.
-    const DataLayout &DL = F.getDataLayout();
-    unsigned PSArgCount = 0;
-    unsigned IntermediateVGPR = 0;
-    for (auto &Arg : F.args()) {
-      unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
-      if (Arg.hasAttribute(Attribute::InReg)) {
-        WaveDispatchNumSGPR += NumRegs;
-      } else {
-        // If this is a PS shader and we're processing the PS Input args (first
-        // 16 VGPR), use the InputEna and InputAddr bits to define how many
-        // VGPRs are actually used.
-        // Any extra VGPR arguments are handled as normal arguments (and
-        // contribute to the VGPR count whether they're used or not).
-        if (IsPixelShader && PSArgCount < 16) {
-          if ((1 << PSArgCount) & InputAddr) {
-            if (PSArgCount < LastEna)
-              WaveDispatchNumVGPR += NumRegs;
-            else
-              IntermediateVGPR += NumRegs;
-          }
-          PSArgCount++;
-        } else {
-          // If there are extra arguments we have to include the allocation for
-          // the non-used (but enabled with InputAddr) input arguments
-          if (IntermediateVGPR) {
-            WaveDispatchNumVGPR += IntermediateVGPR;
-            IntermediateVGPR = 0;
-          }
-          WaveDispatchNumVGPR += NumRegs;
-        }
-      }
-    }
+  if (WaveDispatchNumSGPR) {
     ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(
-        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+        {ProgInfo.NumSGPR,
+         MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
+                                 Ctx)},
+        Ctx);
+  }
 
+  if (WaveDispatchNumVGPR) {
     ProgInfo.NumArchVGPR = AMDGPUMCExpr::createMax(
         {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
 
     ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
         ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
-  } else if (isKernel(F.getCallingConv()) &&
-             MFI->getNumKernargPreloadedSGPRs()) {
-    // Consider cases where the total number of UserSGPRs with trailing
-    // allocated preload SGPRs, is greater than the number of explicitly
-    // referenced SGPRs.
-    const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
-        CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
-    ProgInfo.NumSGPR =
-        AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3d8d274f06246..64a9bde4e26e9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -580,6 +580,9 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
     ++i;
   }
 
+  if (Info->getNumKernargPreloadedSGPRs())
+    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
+
   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
   return true;
@@ -743,6 +746,15 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   if (!determineAssignments(Assigner, SplitArgs, CCInfo))
     return false;
 
+  if (IsEntryFunc) {
+    // This assumes the registers are allocated by CCInfo in ascending order
+    // with no gaps.
+    Info->setNumWaveDispatchSGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+    Info->setNumWaveDispatchVGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+  }
+
   FormalArgHandler Handler(B, MRI);
   if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
     return false;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b327fb894504..1b7d65a31635f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3106,6 +3106,15 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+
+    // This assumes the registers are allocated by CCInfo in ascending order
+    // with no gaps.
+    Info->setNumWaveDispatchSGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
+    Info->setNumWaveDispatchVGPRs(
+        CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
+  } else if (Info->getNumKernargPreloadedSGPRs()) {
+    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
   }
 
   SmallVector<SDValue, 16> Chains;

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9a1448f1f95dc..49425d57187cb 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -728,6 +728,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
       MemoryBound(MFI.isMemoryBound()), WaveLimiter(MFI.needsWaveLimiter()),
       HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
       HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
+      NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
+      NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
       HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
       Occupancy(MFI.getOccupancy()),
       ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
@@ -784,6 +786,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
   WaveLimiter = YamlMFI.WaveLimiter;
   HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
   HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
+  NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
+  NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
   BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
   ReturnsVoid = YamlMFI.ReturnsVoid;
   IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 08b0206d244fb..ca8f8033a2d54 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -270,6 +270,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   bool WaveLimiter = false;
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
+  uint16_t NumWaveDispatchSGPRs = 0;
+  uint16_t NumWaveDispatchVGPRs = 0;
   uint32_t HighBitsOf32BitAddress = 0;
 
   // TODO: 10 may be a better default since it's the maximum.
@@ -327,6 +329,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
     YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
     YamlIO.mapOptional("hasSpilledSGPRs", MFI.HasSpilledSGPRs, false);
     YamlIO.mapOptional("hasSpilledVGPRs", MFI.HasSpilledVGPRs, false);
+    YamlIO.mapOptional("numWaveDispatchSGPRs", MFI.NumWaveDispatchSGPRs, false);
+    YamlIO.mapOptional("numWaveDispatchVGPRs", MFI.NumWaveDispatchVGPRs, false);
     YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
                        StringValue("$private_rsrc_reg"));
     YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
@@ -465,6 +469,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned NumUserSGPRs = 0;
   unsigned NumSystemSGPRs = 0;
 
+  unsigned NumWaveDispatchSGPRs = 0;
+  unsigned NumWaveDispatchVGPRs = 0;
+
   bool HasSpilledSGPRs = false;
   bool HasSpilledVGPRs = false;
   bool HasNonSpillStackObjects = false;
@@ -991,6 +998,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     return UserSGPRInfo.getNumKernargPreloadSGPRs();
   }
 
+  unsigned getNumWaveDispatchSGPRs() const { return NumWaveDispatchSGPRs; }
+
+  void setNumWaveDispatchSGPRs(unsigned Count) { NumWaveDispatchSGPRs = Count; }
+
+  unsigned getNumWaveDispatchVGPRs() const { return NumWaveDispatchVGPRs; }
+
+  void setNumWaveDispatchVGPRs(unsigned Count) { NumWaveDispatchVGPRs = Count; }
+
   Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
     return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
   }

diff  --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
index 013b68a40f44b..99e5d0017f30b 100644
--- a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
-;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=1 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-pal -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
+;RUN: llc -global-isel=0 < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 | FileCheck %s --check-prefixes=CHECK
 
 ; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
 ; ;CHECK: NumVgprs: 4

diff  --git a/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll
new file mode 100644
index 0000000000000..3c7b5bf97b879
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-count-graphics.ll
@@ -0,0 +1,38 @@
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=CHECK,PACKED16
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck %s --check-prefixes=CHECK,SPLIT16
+
+@global = addrspace(1) global i32 poison, align 4
+
+; The hardware initializes the registers received as arguments by entry points,
+; so they will be counted even if unused.
+
+; Vectors of i1 are always unpacked
+
+; CHECK-LABEL: vec_of_i1:
+; CHECK:  TotalNumSgprs: 8
+define amdgpu_ps void @vec_of_i1(<8 x i1> inreg %v8i1) {
+  ret void
+}
+
+; Vectors of i8 are always unpacked
+
+; CHECK-LABEL: vec_of_i8:
+; CHECK:  TotalNumSgprs: 4
+define amdgpu_ps void @vec_of_i8(<4 x i8> inreg %v4i8) {
+  ret void
+}
+
+; Vectors of 16-bit types are packed for newer architectures and unpacked for older ones.
+
+; CHECK-LABEL: vec_of_16_bit_ty:
+; PACKED16: TotalNumSgprs: 3
+; SPLIT16:  TotalNumSgprs: 6
+define amdgpu_ps void @vec_of_16_bit_ty(<2 x i16> inreg %v2i16, <4 x half> inreg %v4half) {
+  ret void
+}
+
+; CHECK-LABEL: buffer_fat_ptr:
+; CHECK: TotalNumSgprs: 5
+define amdgpu_ps void @buffer_fat_ptr(ptr addrspace(7) inreg %p) {
+  ret void
+}

diff  --git a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
index 76c331cdc8303..e2ef60bb80153 100644
--- a/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave_dispatch_regs.ll
@@ -1,6 +1,9 @@
-; RUN: llc -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal < %s | FileCheck -check-prefix=GCN -check-prefix=SI -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -enable-var-scope %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -enable-var-scope %s
 
 ; This compute shader has input args that claim that it has 17 sgprs and 5 vgprs
 ; in wave dispatch. Ensure that the sgpr and vgpr counts in COMPUTE_PGM_RSRC1
@@ -17,7 +20,7 @@
 ; GCN-NEXT:         .scratch_memory_size: 0
 ; SI-NEXT:          .sgpr_count:     0x11
 ; VI-NEXT:          .sgpr_count:     0x60
-; GFX9-NEXT:        .sgpr_count:     0x11
+; GFX9-NEXT:        .sgpr_count:     0x15
 ; SI-NEXT:          .vgpr_count:     0x5
 ; VI-NEXT:          .vgpr_count:     0x5
 ; GFX9-NEXT:        .vgpr_count:     0x5

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
index 278cf0150c2f7..929db4c9be1c7 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll
@@ -17,6 +17,8 @@
 ; CHECK-NEXT:   waveLimiter:     false
 ; CHECK-NEXT:   hasSpilledSGPRs: false
 ; CHECK-NEXT:   hasSpilledVGPRs: false
+; CHECK-NEXT:   numWaveDispatchSGPRs: 0
+; CHECK-NEXT:   numWaveDispatchVGPRs: 0
 ; CHECK-NEXT:   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT:   frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT:   stackPtrOffsetReg: '$sgpr32'
@@ -287,6 +289,8 @@
 ; CHECK-NEXT:   waveLimiter:     false
 ; CHECK-NEXT:   hasSpilledSGPRs: false
 ; CHECK-NEXT:   hasSpilledVGPRs: false
+; CHECK-NEXT:   numWaveDispatchSGPRs: 0
+; CHECK-NEXT:   numWaveDispatchVGPRs: 0
 ; CHECK-NEXT:   scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT:   frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT:   stackPtrOffsetReg: '$sgpr32'

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
index 890ea44081ce7..f054bea1f2780 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll
@@ -16,6 +16,8 @@
 ; AFTER-PEI-NEXT: waveLimiter:     false
 ; AFTER-PEI-NEXT: hasSpilledSGPRs: true
 ; AFTER-PEI-NEXT: hasSpilledVGPRs: false
+; AFTER-PEI-NEXT: numWaveDispatchSGPRs: 0
+; AFTER-PEI-NEXT: numWaveDispatchVGPRs: 0
 ; AFTER-PEI-NEXT: scratchRSrcReg:  '$sgpr68_sgpr69_sgpr70_sgpr71'
 ; AFTER-PEI-NEXT: frameOffsetReg:  '$fp_reg'
 ; AFTER-PEI-NEXT: stackPtrOffsetReg: '$sgpr32'

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index f84ef8a3844dd..924216efcc461 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -17,6 +17,8 @@
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 0
+; CHECK-NEXT: numWaveDispatchVGPRs: 0
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index cc834d017c149..39f1ddd0609d8 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -17,6 +17,8 @@
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 0
+; CHECK-NEXT: numWaveDispatchVGPRs: 0
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
index 06c580ec6f6b4..0cb9bc095bc50 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir
@@ -17,6 +17,8 @@
 # FULL-NEXT: waveLimiter:     true
 # FULL-NEXT: hasSpilledSGPRs: false
 # FULL-NEXT: hasSpilledVGPRs: false
+# FULL-NEXT: numWaveDispatchSGPRs: 0
+# FULL-NEXT: numWaveDispatchVGPRs: 0
 # FULL-NEXT: scratchRSrcReg:  '$sgpr8_sgpr9_sgpr10_sgpr11'
 # FULL-NEXT: frameOffsetReg:  '$sgpr12'
 # FULL-NEXT: stackPtrOffsetReg:  '$sgpr13'
@@ -127,6 +129,8 @@ body:             |
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: hasSpilledSGPRs: false
 # FULL-NEXT: hasSpilledVGPRs: false
+# FULL-NEXT: numWaveDispatchSGPRs: 0
+# FULL-NEXT: numWaveDispatchVGPRs: 0
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
@@ -206,6 +210,8 @@ body:             |
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: hasSpilledSGPRs: false
 # FULL-NEXT: hasSpilledVGPRs: false
+# FULL-NEXT: numWaveDispatchSGPRs: 0
+# FULL-NEXT: numWaveDispatchVGPRs: 0
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'
@@ -286,6 +292,8 @@ body:             |
 # FULL-NEXT: waveLimiter:     false
 # FULL-NEXT: hasSpilledSGPRs: false
 # FULL-NEXT: hasSpilledVGPRs: false
+# FULL-NEXT: numWaveDispatchSGPRs: 0
+# FULL-NEXT: numWaveDispatchVGPRs: 0
 # FULL-NEXT: scratchRSrcReg:  '$private_rsrc_reg'
 # FULL-NEXT: frameOffsetReg:  '$fp_reg'
 # FULL-NEXT: stackPtrOffsetReg:  '$sp_reg'

diff  --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
index 427154651a381..ab4383b675243 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -20,6 +20,8 @@
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 0
+; CHECK-NEXT: numWaveDispatchVGPRs: 0
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
@@ -80,6 +82,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 3
+; CHECK-NEXT: numWaveDispatchVGPRs: 1
 ; CHECK-NEXT: scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
 ; CHECK-NEXT: frameOffsetReg:  '$fp_reg'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
@@ -144,6 +148,8 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 16
+; CHECK-NEXT: numWaveDispatchVGPRs: 0
 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
 ; CHECK-NEXT: frameOffsetReg: '$sgpr33'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
@@ -200,6 +206,8 @@ define void @function() {
 ; CHECK-NEXT: waveLimiter: false
 ; CHECK-NEXT: hasSpilledSGPRs: false
 ; CHECK-NEXT: hasSpilledVGPRs: false
+; CHECK-NEXT: numWaveDispatchSGPRs: 16
+; CHECK-NEXT: numWaveDispatchVGPRs: 0
 ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
 ; CHECK-NEXT: frameOffsetReg: '$sgpr33'
 ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'