[llvm] 69f7d81 - [AMDGPU] Set number vgprs used in PS shaders based on input registers actually used

Fri Oct 8 06:25:10 PDT 2021

Author: David Stuttard
Date: 2021-10-08T14:24:35+01:00
New Revision: 69f7d81d0a47781e5d4820873c20f725f3d0236e

URL: https://github.com/llvm/llvm-project/commit/69f7d81d0a47781e5d4820873c20f725f3d0236e
DIFF: https://github.com/llvm/llvm-project/commit/69f7d81d0a47781e5d4820873c20f725f3d0236e.diff

LOG: [AMDGPU] Set number vgprs used in PS shaders based on input registers actually used

For PS shaders we can use the SPI_PS_INPUT_ENA and SPI_PS_INPUT_ADDR input
registers to determine how many input VGPRs are actually used.

Calculate the number of VGPR registers used as input VGPRs based on these
registers rather than on the arguments passed in (counting the arguments
conservatively always allocates the maximum).

Differential Revision: https://reviews.llvm.org/D101633

Change-Id: Idf7c060cbbd5f7e3300102c55ecee3c07f209de6

Added: 
    llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
    llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 2b5381ab7a791..8d314bc254ed7 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -716,18 +716,72 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
 
   if (isShader(F.getCallingConv())) {
+    bool IsPixelShader =
+        F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
+
+    // Calculate the number of VGPR registers based on the SPI input registers
+    uint32_t InputEna = 0;
+    uint32_t InputAddr = 0;
+    unsigned LastEna = 0;
+
+    if (IsPixelShader) {
+      // Note for IsPixelShader:
+      // By this stage, all enabled inputs are tagged in InputAddr as well.
+      // We will use InputAddr to determine whether the input counts against the
+      // vgpr total and only use the InputEnable to determine the last input
+      // that is relevant - if extra arguments are used, then we have to honour
+      // the InputAddr for any intermediate non-enabled inputs.
+      InputEna = MFI->getPSInputEnable();
+      InputAddr = MFI->getPSInputAddr();
+
+      // We only need to consider input args up to the last used arg.
+      assert((InputEna || InputAddr) &&
+             "PSInputAddr and PSInputEnable should "
+             "never both be 0 for AMDGPU_PS shaders");
+      // There are some rare circumstances where InputAddr is non-zero and
+      // InputEna can be set to 0. In this case we default to setting LastEna
+      // to 1.
+      LastEna = InputEna ? findLastSet(InputEna) + 1 : 1;
+    }
+
     // FIXME: We should be using the number of registers determined during
     // calling convention lowering to legalize the types.
     const DataLayout &DL = F.getParent()->getDataLayout();
+    unsigned PSArgCount = 0;
+    unsigned IntermediateVGPR = 0;
     for (auto &Arg : F.args()) {
       unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
-      if (Arg.hasAttribute(Attribute::InReg))
+      if (Arg.hasAttribute(Attribute::InReg)) {
         WaveDispatchNumSGPR += NumRegs;
-      else
-        WaveDispatchNumVGPR += NumRegs;
+      } else {
+        // If this is a PS shader and we're processing the PS Input args (first
+        // 16 VGPR), use the InputEna and InputAddr bits to define how many
+        // VGPRs are actually used.
+        // Any extra VGPR arguments are handled as normal arguments (and
+        // contribute to the VGPR count whether they're used or not).
+        if (IsPixelShader && PSArgCount < 16) {
+          if ((1 << PSArgCount) & InputAddr) {
+            if (PSArgCount < LastEna)
+              WaveDispatchNumVGPR += NumRegs;
+            else
+              IntermediateVGPR += NumRegs;
+          }
+          PSArgCount++;
+        } else {
+          // If there are extra arguments we have to include the allocation for
+          // the non-used (but enabled with InputAddr) input arguments
+          if (IntermediateVGPR) {
+            WaveDispatchNumVGPR += IntermediateVGPR;
+            IntermediateVGPR = 0;
+          }
+          WaveDispatchNumVGPR += NumRegs;
+        }
+      }
     }
     ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
-    ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
+    ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
+    ProgInfo.NumVGPR =
+        Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index ef46e53b7460a..e841e939ef34b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -82,11 +82,16 @@ int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
                                    ST.getTargetID().isXnackOnOrAny());
 }
 
+int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
+    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
+  if (ST.hasGFX90AInsts() && ArgNumAGPR)
+    return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
+  return std::max(ArgNumVGPR, ArgNumAGPR);
+}
+
 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
     const GCNSubtarget &ST) const {
-  if (ST.hasGFX90AInsts() && NumAGPR)
-    return alignTo(NumVGPR, 4) + NumAGPR;
-  return std::max(NumVGPR, NumAGPR);
+  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
 }
 
 bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 652f010a084aa..b0a2d3bffc62e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -43,6 +43,10 @@ struct AMDGPUResourceUsageAnalysis : public CallGraphSCCPass {
     bool HasIndirectCall = false;
 
     int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+    // Total number of VGPRs is actually a combination of AGPR and VGPR
+    // depending on architecture - and some alignment constraints
+    int32_t getTotalNumVGPRs(const GCNSubtarget &ST, int32_t NumAGPR,
+                             int32_t NumVGPR) const;
     int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
   };
 

diff  --git a/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
new file mode 100644
index 0000000000000..f3c3acc317ce1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ps-shader-arg-count.ll
@@ -0,0 +1,363 @@
+;RUN: llc < %s -mtriple=amdgcn-pal -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+;RUN: llc < %s -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK
+
+; ;CHECK-LABEL: {{^}}_amdgpu_ps_1_arg:
+; ;CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_1_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1, 0
+  ret { <4 x float> } %ret2
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_3_arg:
+; CHECK: NumVgprs: 6
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_3_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
+  ret { <4 x float> } %ret2
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_gap:
+; CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_gap(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
+  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
+  ret { <4 x float> } %ret2
+}
+
+; Using InitialPSInputAddr of 0x2 causes the 2nd VGPR arg (%arg4) to be included in the packing - this increases the total number of VGPRs and in turn
+; prevents the 3rd VGPR arg (%arg5) from being packed adjacent to the 1st VGPR arg (%arg3) - these two being the only used arguments
+; CHECK-LABEL: {{^}}_amdgpu_ps_2_arg_no_pack:
+; CHECK: NumVgprs: 6
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_2_arg_no_pack(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #1 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.2 = insertelement <4 x float> %ret1, float %i3, i32 1
+  %ret2 = insertvalue { <4 x float> } undef, <4 x float> %ret1.2, 0
+  ret { <4 x float> } %ret2
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg:
+; CHECK: NumVgprs: 24
+define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %i4 = extractelement <3 x float> %arg6, i32 1
+  %i5 = extractelement <2 x float> %arg7, i32 0
+  %i6 = extractelement <2 x float> %arg8, i32 0
+  %i7 = extractelement <2 x float> %arg9, i32 1
+
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
+
+  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
+  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
+  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
+  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
+
+  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
+  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
+  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
+  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
+
+  %arg15.f = bitcast i32 %arg15 to float
+  %arg16.f = bitcast i32 %arg16 to float
+  %arg17.f = bitcast i32 %arg17 to float
+  %arg18.f = bitcast i32 %arg18 to float
+
+  %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
+  %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
+
+  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
+  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
+  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
+  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
+
+  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
+}
+
+; Extra arguments have to be allocated even if they're unused
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra_unused:
+; CHECK: NumVgprs: 26
+define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %i4 = extractelement <3 x float> %arg6, i32 1
+  %i5 = extractelement <2 x float> %arg7, i32 0
+  %i6 = extractelement <2 x float> %arg8, i32 0
+  %i7 = extractelement <2 x float> %arg9, i32 1
+
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
+
+  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
+  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
+  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
+  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
+
+  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
+  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
+  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
+  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
+
+  %arg15.f = bitcast i32 %arg15 to float
+  %arg16.f = bitcast i32 %arg16 to float
+  %arg17.f = bitcast i32 %arg17 to float
+  %arg18.f = bitcast i32 %arg18 to float
+
+  %ret4 = insertelement <4 x float> undef, float %arg15.f, i32 0
+  %ret4.1 = insertelement <4 x float> %ret4, float %arg16.f, i32 1
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg17.f, i32 2
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg18.f, i32 3
+
+  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
+  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
+  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
+  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
+
+  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_arg_extra:
+; CHECK: NumVgprs: 26
+; CHECK: NumVGPRsForWavesPerEU: 26
+define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_all_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %i4 = extractelement <3 x float> %arg6, i32 1
+  %i5 = extractelement <2 x float> %arg7, i32 0
+  %i6 = extractelement <2 x float> %arg8, i32 0
+  %i7 = extractelement <2 x float> %arg9, i32 1
+
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
+
+  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
+  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
+  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
+  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
+
+  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
+  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
+  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
+  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
+
+  %arg15.f = bitcast i32 %arg15 to float
+  %arg16.f = bitcast i32 %arg16 to float
+  %arg17.f = bitcast i32 %arg17 to float
+  %arg18.f = bitcast i32 %arg18 to float
+
+  %arg15_16.f = fadd float %arg15.f, %arg16.f
+  %arg17_18.f = fadd float %arg17.f, %arg18.f
+
+  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
+  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15_16.f, i32 2
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg17_18.f, i32 3
+
+  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
+  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
+  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
+  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
+
+  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
+}
+
+; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused:
+; CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #0 {
+.entry:
+  ret { <4 x float> } undef
+}
+
+; Check that when no input args are used we get the minimum allocation - note that we always enable the first input
+; Additionally set the PSInputAddr to 0 via the metadata
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_ia0:
+; CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_ia0(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18) local_unnamed_addr #3 {
+.entry:
+  ret { <4 x float> } undef
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used:
+; CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
+
+  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
+
+  ret { <4 x float> } %ret.res
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_used:
+; CHECK: NumVgprs: 5
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_used(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %ret4.1 = insertelement <4 x float> undef, float %arg14, i32 0
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg1, i32 1
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %extra_arg2, i32 2
+
+  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
+
+  ret { <4 x float> } %ret.res
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_part_unused_extra_unused:
+; CHECK: NumVgprs: 7
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_part_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %ret4.1 = insertelement <4 x float> undef, float %arg12, i32 0
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg13, i32 1
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg14, i32 2
+
+  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.3, 0
+
+  ret { <4 x float> } %ret.res
+}
+
+; Extra unused inputs are always added to the allocation
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused:
+; CHECK: NumVgprs: 4
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+
+  ret { <4 x float> } undef
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_used_no_packing:
+; CHECK: NumVgprs: 26
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_used_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
+.entry:
+  %ret4.1 = insertelement <4 x float> undef, float %extra_arg1, i32 0
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %extra_arg2, i32 1
+
+  %ret.res  = insertvalue { <4 x float> } undef, <4 x float> %ret4.2, 0
+
+  ret { <4 x float> } %ret.res
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_all_unused_extra_unused_no_packing:
+; CHECK: NumVgprs: 26
+define dllexport amdgpu_ps { <4 x float> } @_amdgpu_ps_all_unused_extra_unused_no_packing(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
+.entry:
+  ret { <4 x float> } undef
+}
+
+; CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_arg_extra:
+; CHECK: NumVgprs: 24
+; CHECK: NumVGPRsForWavesPerEU: 24
+define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #0 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %i4 = extractelement <3 x float> %arg6, i32 1
+  %i5 = extractelement <2 x float> %arg7, i32 0
+  %i6 = extractelement <2 x float> %arg8, i32 0
+  %i7 = extractelement <2 x float> %arg9, i32 1
+
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
+
+  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
+  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
+  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
+  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
+
+  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
+  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
+  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
+  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
+
+  %arg15.f = bitcast i32 %arg15 to float
+  %arg16.f = bitcast i32 %arg16 to float
+
+  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
+  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
+  %ret4.2 = insertelement <4 x float> %ret4.1, float %arg15.f, i32 2
+  %ret4.3 = insertelement <4 x float> %ret4.2, float %arg16.f, i32 3
+
+  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
+  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
+  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
+  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.3, 3
+
+  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
+}
+
+;CHECK-LABEL: {{^}}_amdgpu_ps_some_unused_no_packing_arg_extra:
+;CHECK: NumVgprs: 26
+;CHECK: NumVGPRsForWavesPerEU: 26
+define dllexport amdgpu_ps { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @_amdgpu_ps_some_unused_no_packing_arg_extra(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x float> %arg3, <2 x float> %arg4, <2 x float> %arg5, <3 x float> %arg6, <2 x float> %arg7, <2 x float> %arg8, <2 x float> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, i32 %arg15, i32 %arg16, i32 %arg17, i32 %arg18, float %extra_arg1, float %extra_arg2) local_unnamed_addr #2 {
+.entry:
+  %i1 = extractelement <2 x float> %arg3, i32 1
+  %i2 = extractelement <2 x float> %arg4, i32 0
+  %i3 = extractelement <2 x float> %arg5, i32 1
+  %i4 = extractelement <3 x float> %arg6, i32 1
+  %i5 = extractelement <2 x float> %arg7, i32 0
+  %i6 = extractelement <2 x float> %arg8, i32 0
+  %i7 = extractelement <2 x float> %arg9, i32 1
+
+  %ret1 = insertelement <4 x float> undef, float %i1, i32 0
+  %ret1.1 = insertelement <4 x float> %ret1, float %i2, i32 1
+  %ret1.2 = insertelement <4 x float> %ret1.1, float %i3, i32 2
+  %ret1.3 = insertelement <4 x float> %ret1.2, float %i4, i32 3
+
+  %ret2 = insertelement <4 x float> undef, float %i5, i32 0
+  %ret2.1 = insertelement <4 x float> %ret2, float %i6, i32 1
+  %ret2.2 = insertelement <4 x float> %ret2.1, float %i7, i32 2
+  %ret2.3 = insertelement <4 x float> %ret2.2, float %arg10, i32 3
+
+  %ret3 = insertelement <4 x float> undef, float %arg11, i32 0
+  %ret3.1 = insertelement <4 x float> %ret3, float %arg12, i32 1
+  %ret3.2 = insertelement <4 x float> %ret3.1, float %arg13, i32 2
+  %ret3.3 = insertelement <4 x float> %ret3.2, float %arg14, i32 3
+
+  %ret4 = insertelement <4 x float> undef, float %extra_arg1, i32 0
+  %ret4.1 = insertelement <4 x float> %ret4, float %extra_arg2, i32 1
+
+  %ret.res1 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } undef, <4 x float> %ret1.3, 0
+  %ret.res2 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res1, <4 x float> %ret2.3, 1
+  %ret.res3 = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res2, <4 x float> %ret3.3, 2
+  %ret.res  = insertvalue { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res3, <4 x float> %ret4.1, 3
+
+  ret { < 4 x float>, <4 x float>, <4 x float>, <4 x float> } %ret.res
+}
+
+attributes #0 = { nounwind "target-features"=",+wavefrontsize64,+cumode"  }
+attributes #1 = { nounwind "InitialPSInputAddr"="2" "target-features"=",+wavefrontsize64,+cumode" }
+attributes #2 = { nounwind "InitialPSInputAddr"="0xffff" "target-features"=",+wavefrontsize64,+cumode" }
+attributes #3 = { nounwind "InitialPSInputAddr"="0" "target-features"=",+wavefrontsize64,+cumode" }