[clang] [llvm] [AMDGPU] Convert AMDGPUResourceUsageAnalysis pass from Module to MF pass (PR #102913)

Janek van Oirschot via cfe-commits cfe-commits@lists.llvm.org
Thu Aug 15 06:22:01 PDT 2024


https://github.com/JanekvO updated https://github.com/llvm/llvm-project/pull/102913

>From 136ad3a97df7b02d89d845920c53ef80da1e7c31 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot@amd.com>
Date: Mon, 12 Aug 2024 14:58:41 +0100
Subject: [PATCH 1/2] Convert AMDGPUResourceUsageAnalysis pass from Module to
 MachineFunction pass and move metadata propagation logic to MC layer

---
 .../amdgcn-machine-analysis-remarks.cl        |  10 +-
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   | 208 +++++--
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h     |  19 +-
 .../Target/AMDGPU/AMDGPUMCResourceInfo.cpp    | 220 ++++++++
 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h |  94 +++
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    | 147 +----
 .../AMDGPU/AMDGPUResourceUsageAnalysis.h      |  40 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 .../MCTargetDesc/AMDGPUTargetStreamer.cpp     |  41 ++
 .../MCTargetDesc/AMDGPUTargetStreamer.h       |  23 +
 .../Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp |  10 +-
 .../AMDGPU/GlobalISel/extractelement.ll       | 168 +++---
 .../AMDGPU/GlobalISel/flat-scratch-init.ll    |  16 +-
 .../GlobalISel/llvm.amdgcn.workitem.id.ll     |   6 +-
 .../AMDGPU/GlobalISel/non-entry-alloca.ll     |   8 +-
 .../CodeGen/AMDGPU/agpr-register-count.ll     |  65 ++-
 .../CodeGen/AMDGPU/amdgpu.private-memory.ll   |   3 +-
 .../amdpal-metadata-agpr-register-count.ll    |   4 +-
 ...-amdgpu-flat-work-group-size-vgpr-limit.ll |  51 +-
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |  30 +-
 .../AMDGPU/call-alias-register-usage-agpr.ll  |  16 +-
 .../AMDGPU/call-alias-register-usage0.ll      |   8 +-
 .../AMDGPU/call-alias-register-usage1.ll      |  11 +-
 .../AMDGPU/call-alias-register-usage2.ll      |  11 +-
 .../AMDGPU/call-alias-register-usage3.ll      |  11 +-
 .../AMDGPU/call-graph-register-usage.ll       |  50 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   4 +-
 .../test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll |   1 +
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |  34 +-
 .../AMDGPU/codegen-internal-only-func.ll      |   6 +-
 .../AMDGPU/control-flow-fastregalloc.ll       |   6 +-
 llvm/test/CodeGen/AMDGPU/elf.ll               |   7 +-
 .../enable-scratch-only-dynamic-stack.ll      |  14 +-
 .../CodeGen/AMDGPU/function-resource-usage.ll | 533 ++++++++++++++++++
 .../AMDGPU/gfx11-user-sgpr-init16-bug.ll      |  16 +-
 .../AMDGPU/inline-asm-reserved-regs.ll        |   2 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  24 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |  14 +-
 llvm/test/CodeGen/AMDGPU/kernarg-size.ll      |   4 +-
 .../CodeGen/AMDGPU/kernel_code_t_recurse.ll   |   7 +-
 .../CodeGen/AMDGPU/large-alloca-compute.ll    |  31 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  45 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll |   6 +-
 .../AMDGPU/lower-module-lds-offsets.ll        |  26 +-
 llvm/test/CodeGen/AMDGPU/mesa3d.ll            |  13 +-
 .../AMDGPU/module-lds-false-sharing.ll        |  99 ++--
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll  |  20 +-
 llvm/test/CodeGen/AMDGPU/recursion.ll         |  28 +-
 .../AMDGPU/resource-optimization-remarks.ll   |  64 +--
 .../AMDGPU/resource-usage-dead-function.ll    |  13 +-
 .../CodeGen/AMDGPU/stack-realign-kernel.ll    | 126 +++--
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   3 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   3 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   3 +-
 llvm/test/CodeGen/AMDGPU/trap.ll              |  33 +-
 55 files changed, 1801 insertions(+), 655 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
 create mode 100644 llvm/test/CodeGen/AMDGPU/function-resource-usage.ll

diff --git a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
index a05e21b37b9127..a2dd59a871904c 100644
--- a/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
+++ b/clang/test/Frontend/amdgcn-machine-analysis-remarks.cl
@@ -2,12 +2,12 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx908 -Rpass-analysis=kernel-resource-usage -S -O0 -verify %s -o /dev/null
 
 // expected-remark@+10 {{Function Name: foo}}
-// expected-remark@+9 {{    SGPRs: 13}}
-// expected-remark@+8 {{    VGPRs: 10}}
-// expected-remark@+7 {{    AGPRs: 12}}
-// expected-remark@+6 {{    ScratchSize [bytes/lane]: 0}}
+// expected-remark@+9 {{    SGPRs: foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1))}}
+// expected-remark@+8 {{    VGPRs: foo.num_vgpr}}
+// expected-remark@+7 {{    AGPRs: foo.num_agpr}}
+// expected-remark@+6 {{    ScratchSize [bytes/lane]: foo.private_seg_size}}
 // expected-remark@+5 {{    Dynamic Stack: False}}
-// expected-remark@+4 {{    Occupancy [waves/SIMD]: 10}}
+// expected-remark@+4 {{    Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 10, max(foo.num_sgpr+(extrasgprs(foo.uses_vcc, foo.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(foo.num_agpr, foo.num_vgpr), 1, 0))}}
 // expected-remark@+3 {{    SGPRs Spill: 0}}
 // expected-remark@+2 {{    VGPRs Spill: 0}}
 // expected-remark@+1 {{    LDS Size [bytes/block]: 0}}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index b90d245b7bd394..23bc804515e690 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -18,6 +18,7 @@
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
 #include "AMDGPUHSAMetadataStreamer.h"
+#include "AMDGPUMCResourceInfo.h"
 #include "AMDGPUResourceUsageAnalysis.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
@@ -92,6 +93,9 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                    std::unique_ptr<MCStreamer> Streamer)
     : AsmPrinter(TM, std::move(Streamer)) {
   assert(OutStreamer && "AsmPrinter constructed without streamer");
+  RI = std::make_unique<MCResourceInfo>(OutContext);
+  OccupancyValidateMap =
+      std::make_unique<DenseMap<const Function *, const MCExpr *>>();
 }
 
 StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -359,6 +363,102 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
   return AsmPrinter::doInitialization(M);
 }
 
+void AMDGPUAsmPrinter::ValidateMCResourceInfo(Function &F) {
+  if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+    return;
+
+  using RIK = MCResourceInfo::ResourceInfoKind;
+  const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
+
+  auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val)) {
+      Res = Val;
+      return true;
+    }
+    return false;
+  };
+
+  const uint64_t MaxScratchPerWorkitem =
+      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
+  MCSymbol *ScratchSizeSymbol =
+      RI->getSymbol(F.getName(), RIK::RIK_PrivateSegSize);
+  uint64_t ScratchSize;
+  if (ScratchSizeSymbol->isVariable() &&
+      TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
+      ScratchSize > MaxScratchPerWorkitem) {
+    DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
+                                          DS_Error);
+    F.getContext().diagnose(DiagStackSize);
+  }
+
+  // Validate addressable scalar registers (i.e., prior to added implicit
+  // SGPRs).
+  MCSymbol *NumSGPRSymbol = RI->getSymbol(F.getName(), RIK::RIK_NumSGPR);
+  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+      !STM.hasSGPRInitBug()) {
+    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+    uint64_t NumSgpr;
+    if (NumSGPRSymbol->isVariable() &&
+        TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
+      DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
+                                       NumSgpr, MaxAddressableNumSGPRs,
+                                       DS_Error, DK_ResourceLimit);
+      F.getContext().diagnose(Diag);
+      return;
+    }
+  }
+
+  MCSymbol *VCCUsedSymbol = RI->getSymbol(F.getName(), RIK::RIK_UsesVCC);
+  MCSymbol *FlatUsedSymbol =
+      RI->getSymbol(F.getName(), RIK::RIK_UsesFlatScratch);
+  uint64_t VCCUsed, FlatUsed, NumSgpr;
+
+  if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
+      FlatUsedSymbol->isVariable() &&
+      TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
+      TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
+      TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
+
+    // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
+    // resolvable.
+    NumSgpr += IsaInfo::getNumExtraSGPRs(
+        &STM, VCCUsed, FlatUsed,
+        getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+    if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
+        STM.hasSGPRInitBug()) {
+      unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+      if (NumSgpr > MaxAddressableNumSGPRs) {
+        DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
+                                         MaxAddressableNumSGPRs, DS_Error,
+                                         DK_ResourceLimit);
+        F.getContext().diagnose(Diag);
+        return;
+      }
+    }
+
+    auto I = OccupancyValidateMap->find(&F);
+    if (I != OccupancyValidateMap->end()) {
+      const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
+          F, "amdgpu-waves-per-eu", {0, 0}, true);
+      uint64_t Occupancy;
+      const MCExpr *OccupancyExpr = I->getSecond();
+
+      if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
+        DiagnosticInfoOptimizationFailure Diag(
+            F, F.getSubprogram(),
+            "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
+            "'" +
+                F.getName() + "': desired occupancy was " + Twine(MinWEU) +
+                ", final occupancy is " + Twine(Occupancy));
+        F.getContext().diagnose(Diag);
+        return;
+      }
+    }
+  }
+}
+
 bool AMDGPUAsmPrinter::doFinalization(Module &M) {
   // Pad with s_code_end to help tools and guard against instruction prefetch
   // causing stale data in caches. Arguably this should be done by the linker,
@@ -371,25 +471,16 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
     getTargetStreamer()->EmitCodeEnd(STI);
   }
 
-  return AsmPrinter::doFinalization(M);
-}
+  // Assign expressions which can only be resolved when all other functions are
+  // known.
+  RI->Finalize();
+  getTargetStreamer()->EmitMCResourceMaximums(
+      RI->getMaxVGPRSymbol(), RI->getMaxAGPRSymbol(), RI->getMaxSGPRSymbol());
 
-// Print comments that apply to both callable functions and entry points.
-void AMDGPUAsmPrinter::emitCommonFunctionComments(
-    uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
-    uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
-    const AMDGPUMachineFunction *MFI) {
-  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
-  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
-  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
-  if (NumAGPR) {
-    OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
-    OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
-                                false);
+  for (Function &F : M.functions()) {
+    ValidateMCResourceInfo(F);
   }
-  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
-  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
-                              false);
+  return AsmPrinter::doFinalization(M);
 }
 
 SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
@@ -402,6 +493,7 @@ SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
   return Str;
 }
 
+// Print comments that apply to both callable functions and entry points.
 void AMDGPUAsmPrinter::emitCommonFunctionComments(
     const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
     const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
@@ -571,21 +663,45 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
                            STM.hasMAIInsts());
 
+  {
+    const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
+        ResourceUsage->getResourceInfo();
+    RI->gatherResourceInfo(MF, Info);
+    using RIK = MCResourceInfo::ResourceInfoKind;
+    getTargetStreamer()->EmitMCResourceInfo(
+        RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR),
+        RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR),
+        RI->getSymbol(MF.getName(), RIK::RIK_NumSGPR),
+        RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize),
+        RI->getSymbol(MF.getName(), RIK::RIK_UsesVCC),
+        RI->getSymbol(MF.getName(), RIK::RIK_UsesFlatScratch),
+        RI->getSymbol(MF.getName(), RIK::RIK_HasDynSizedStack),
+        RI->getSymbol(MF.getName(), RIK::RIK_HasRecursion),
+        RI->getSymbol(MF.getName(), RIK::RIK_HasIndirectCall));
+  }
+
   if (isVerbose()) {
     MCSectionELF *CommentSection =
         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
     OutStreamer->switchSection(CommentSection);
 
     if (!MFI->isEntryFunction()) {
+      using RIK = MCResourceInfo::ResourceInfoKind;
       OutStreamer->emitRawComment(" Function info:", false);
-      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
-          ResourceUsage->getResourceInfo(&MF.getFunction());
+
       emitCommonFunctionComments(
-          Info.NumVGPR,
-          STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
-          Info.getTotalNumVGPRs(STM),
-          Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
-          Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
+          RI->getSymbol(MF.getName(), RIK::RIK_NumVGPR)->getVariableValue(),
+          STM.hasMAIInsts() ? RI->getSymbol(MF.getName(), RIK::RIK_NumAGPR)
+                                  ->getVariableValue()
+                            : nullptr,
+          RI->createTotalNumVGPRs(MF, Ctx),
+          RI->createTotalNumSGPRs(
+              MF,
+              MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
+              Ctx),
+          RI->getSymbol(MF.getName(), RIK::RIK_PrivateSegSize)
+              ->getVariableValue(),
+          getFunctionCodeSize(MF), MFI);
       return false;
     }
 
@@ -753,8 +869,6 @@ uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const
 
 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                         const MachineFunction &MF) {
-  const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
-      ResourceUsage->getResourceInfo(&MF.getFunction());
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   MCContext &Ctx = MF.getContext();
 
@@ -771,18 +885,38 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     return false;
   };
 
-  ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
-  ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
-  ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
-  ProgInfo.AccumOffset =
-      CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
+  auto GetSymRefExpr =
+      [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
+    MCSymbol *Sym = RI->getSymbol(MF.getName(), RIK);
+    return MCSymbolRefExpr::create(Sym, Ctx);
+  };
+
+  const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
+  const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
+
+  using RIK = MCResourceInfo::ResourceInfoKind;
+  ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
+  ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
+  ProgInfo.NumVGPR = AMDGPUMCExpr::createTotalNumVGPR(
+      ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
+
+  // AccumOffset computed for the MCExpr equivalent of:
+  // alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+  ProgInfo.AccumOffset = MCBinaryExpr::createSub(
+      MCBinaryExpr::createDiv(
+          AMDGPUMCExpr::createAlignTo(
+              AMDGPUMCExpr::createMax({ConstOne, ProgInfo.NumArchVGPR}, Ctx),
+              ConstFour, Ctx),
+          ConstFour, Ctx),
+      ConstOne, Ctx);
   ProgInfo.TgSplit = STM.isTgSplitEnabled();
-  ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
-  ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
-  ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
-  ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+  ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
+  ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
+  ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
+  ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
   ProgInfo.DynamicCallStack =
-      CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
+      MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
+                             GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
 
   const uint64_t MaxScratchPerWorkitem =
       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
@@ -1082,6 +1216,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
       ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
 
+  OccupancyValidateMap->insert({&MF.getFunction(), ProgInfo.Occupancy});
+
   const auto [MinWEU, MaxWEU] =
       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
   uint64_t Occupancy;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index f66bbde42ce278..676a4687ee2af7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -24,6 +24,7 @@ struct AMDGPUResourceUsageAnalysis;
 class AMDGPUTargetStreamer;
 class MCCodeEmitter;
 class MCOperand;
+class MCResourceInfo;
 
 namespace AMDGPU {
 struct MCKernelDescriptor;
@@ -40,12 +41,20 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 
   AMDGPUResourceUsageAnalysis *ResourceUsage;
 
+  std::unique_ptr<MCResourceInfo> RI;
+
   SIProgramInfo CurrentProgramInfo;
 
   std::unique_ptr<AMDGPU::HSAMD::MetadataStreamer> HSAMetadataStream;
 
   MCCodeEmitter *DumpCodeInstEmitter = nullptr;
 
+  // ValidateMCResourceInfo cannot recompute parts of the occupancy as it does
+  // for other metadata to validate (e.g., NumSGPRs) so a map is necessary if we
+  // really want to track and validate the occupancy.
+  std::unique_ptr<DenseMap<const Function *, const MCExpr *>>
+      OccupancyValidateMap;
+
   uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
 
   void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
@@ -60,11 +69,6 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
   void EmitPALMetadata(const MachineFunction &MF,
                        const SIProgramInfo &KernelInfo);
   void emitPALFunctionMetadata(const MachineFunction &MF);
-  void emitCommonFunctionComments(uint32_t NumVGPR,
-                                  std::optional<uint32_t> NumAGPR,
-                                  uint32_t TotalNumVGPR, uint32_t NumSGPR,
-                                  uint64_t ScratchSize, uint64_t CodeSize,
-                                  const AMDGPUMachineFunction *MFI);
   void emitCommonFunctionComments(const MCExpr *NumVGPR, const MCExpr *NumAGPR,
                                   const MCExpr *TotalNumVGPR,
                                   const MCExpr *NumSGPR,
@@ -84,6 +88,11 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 
   SmallString<128> getMCExprStr(const MCExpr *Value);
 
+  /// Attempts to replace the validation that is missed in getSIProgramInfo due
+  /// to MCExpr being unknown. Invoked during doFinalization such that the
+  /// MCResourceInfo symbols are known.
+  void ValidateMCResourceInfo(Function &F);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
new file mode 100644
index 00000000000000..58383475b312c9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -0,0 +1,220 @@
+//===- AMDGPUMCResourceInfo.cpp --- MC Resource Info ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUMCResourceInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+
+using namespace llvm;
+
+MCSymbol *MCResourceInfo::getSymbol(StringRef FuncName, ResourceInfoKind RIK) {
+  switch (RIK) {
+  case RIK_NumVGPR:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".num_vgpr"));
+  case RIK_NumAGPR:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".num_agpr"));
+  case RIK_NumSGPR:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".num_sgpr"));
+  case RIK_PrivateSegSize:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".private_seg_size"));
+  case RIK_UsesVCC:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_vcc"));
+  case RIK_UsesFlatScratch:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".uses_flat_scratch"));
+  case RIK_HasDynSizedStack:
+    return OutContext.getOrCreateSymbol(FuncName +
+                                        Twine(".has_dyn_sized_stack"));
+  case RIK_HasRecursion:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".has_recursion"));
+  case RIK_HasIndirectCall:
+    return OutContext.getOrCreateSymbol(FuncName + Twine(".has_indirect_call"));
+  }
+  llvm_unreachable("Unexpected ResourceInfoKind.");
+}
+
+const MCExpr *MCResourceInfo::getSymRefExpr(StringRef FuncName,
+                                            ResourceInfoKind RIK,
+                                            MCContext &Ctx) {
+  return MCSymbolRefExpr::create(getSymbol(FuncName, RIK), Ctx);
+}
+
+void MCResourceInfo::assignMaxRegs() {
+  // Assign, to each max_num_Xgpr symbol, an expression for the max register use.
+  MCSymbol *MaxVGPRSym = getMaxVGPRSymbol();
+  MCSymbol *MaxAGPRSym = getMaxAGPRSymbol();
+  MCSymbol *MaxSGPRSym = getMaxSGPRSymbol();
+
+  auto assignMaxRegSym = [this](MCSymbol *Sym, int32_t RegCount) {
+    const MCExpr *MaxExpr = MCConstantExpr::create(RegCount, OutContext);
+    Sym->setVariableValue(MaxExpr);
+  };
+
+  assignMaxRegSym(MaxVGPRSym, MaxVGPR);
+  assignMaxRegSym(MaxAGPRSym, MaxAGPR);
+  assignMaxRegSym(MaxSGPRSym, MaxSGPR);
+}
+
+void MCResourceInfo::Finalize() {
+  assert(!finalized && "Cannot finalize ResourceInfo again.");
+  finalized = true;
+  assignMaxRegs();
+}
+
+MCSymbol *MCResourceInfo::getMaxVGPRSymbol() {
+  return OutContext.getOrCreateSymbol("max_num_vgpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxAGPRSymbol() {
+  return OutContext.getOrCreateSymbol("max_num_agpr");
+}
+
+MCSymbol *MCResourceInfo::getMaxSGPRSymbol() {
+  return OutContext.getOrCreateSymbol("max_num_sgpr");
+}
+
+void MCResourceInfo::assignResourceInfoExpr(
+    int64_t localValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
+    const MachineFunction &MF,
+    const SmallVectorImpl<const Function *> &Callees) {
+  const MCConstantExpr *localConstExpr =
+      MCConstantExpr::create(localValue, OutContext);
+  const MCExpr *SymVal = localConstExpr;
+  if (Callees.size() > 0) {
+    std::vector<const MCExpr *> ArgExprs;
+    // Avoid recursive symbol assignment.
+    SmallSet<StringRef, 8> Seen;
+    ArgExprs.push_back(localConstExpr);
+    Seen.insert(MF.getName());
+
+    for (const Function *Callee : Callees) {
+      if (Seen.contains(Callee->getName()))
+        continue;
+      Seen.insert(Callee->getName());
+      MCSymbol *calleeValSym = getSymbol(Callee->getName(), RIK);
+      ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+    }
+    SymVal = AMDGPUMCExpr::create(Kind, ArgExprs, OutContext);
+  }
+  MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+  Sym->setVariableValue(SymVal);
+}
+
+void MCResourceInfo::gatherResourceInfo(
+    const MachineFunction &MF,
+    const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI) {
+  // Worst case VGPR use for non-hardware-entrypoints.
+  MCSymbol *maxVGPRSym = getMaxVGPRSymbol();
+  MCSymbol *maxAGPRSym = getMaxAGPRSymbol();
+  MCSymbol *maxSGPRSym = getMaxSGPRSymbol();
+
+  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())) {
+    addMaxVGPRCandidate(FRI.NumVGPR);
+    addMaxAGPRCandidate(FRI.NumAGPR);
+    addMaxSGPRCandidate(FRI.NumExplicitSGPR);
+  }
+
+  auto setMaxReg = [&](MCSymbol *MaxSym, int32_t numRegs,
+                       ResourceInfoKind RIK) {
+    if (!FRI.HasIndirectCall) {
+      assignResourceInfoExpr(numRegs, RIK, AMDGPUMCExpr::AGVK_Max, MF,
+                             FRI.Callees);
+    } else {
+      const MCExpr *SymRef = MCSymbolRefExpr::create(MaxSym, OutContext);
+      MCSymbol *LocalNumSym = getSymbol(MF.getName(), RIK);
+      const MCExpr *MaxWithLocal = AMDGPUMCExpr::createMax(
+          {MCConstantExpr::create(numRegs, OutContext), SymRef}, OutContext);
+      LocalNumSym->setVariableValue(MaxWithLocal);
+    }
+  };
+
+  setMaxReg(maxVGPRSym, FRI.NumVGPR, RIK_NumVGPR);
+  setMaxReg(maxAGPRSym, FRI.NumAGPR, RIK_NumAGPR);
+  setMaxReg(maxSGPRSym, FRI.NumExplicitSGPR, RIK_NumSGPR);
+
+  {
+    // The expression for private segment size should be: FRI.PrivateSegmentSize
+    // + max(FRI.Callees, FRI.CalleeSegmentSize)
+    std::vector<const MCExpr *> ArgExprs;
+    if (FRI.CalleeSegmentSize)
+      ArgExprs.push_back(
+          MCConstantExpr::create(FRI.CalleeSegmentSize, OutContext));
+
+    if (!FRI.HasIndirectCall) {
+      for (const Function *Callee : FRI.Callees) {
+        MCSymbol *calleeValSym =
+            getSymbol(Callee->getName(), RIK_PrivateSegSize);
+        ArgExprs.push_back(MCSymbolRefExpr::create(calleeValSym, OutContext));
+      }
+    }
+    const MCExpr *localConstExpr =
+        MCConstantExpr::create(FRI.PrivateSegmentSize, OutContext);
+    if (ArgExprs.size() > 0) {
+      const AMDGPUMCExpr *transitiveExpr =
+          AMDGPUMCExpr::createMax(ArgExprs, OutContext);
+      localConstExpr =
+          MCBinaryExpr::createAdd(localConstExpr, transitiveExpr, OutContext);
+    }
+    getSymbol(MF.getName(), RIK_PrivateSegSize)
+        ->setVariableValue(localConstExpr);
+  }
+
+  auto setToLocal = [&](int64_t localValue, ResourceInfoKind RIK) {
+    MCSymbol *Sym = getSymbol(MF.getName(), RIK);
+    Sym->setVariableValue(MCConstantExpr::create(localValue, OutContext));
+  };
+
+  if (!FRI.HasIndirectCall) {
+    assignResourceInfoExpr(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+    assignResourceInfoExpr(FRI.UsesFlatScratch,
+                           ResourceInfoKind::RIK_UsesFlatScratch,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+    assignResourceInfoExpr(FRI.HasDynamicallySizedStack,
+                           ResourceInfoKind::RIK_HasDynSizedStack,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+    assignResourceInfoExpr(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+    assignResourceInfoExpr(FRI.HasIndirectCall,
+                           ResourceInfoKind::RIK_HasIndirectCall,
+                           AMDGPUMCExpr::AGVK_Or, MF, FRI.Callees);
+  } else {
+    setToLocal(FRI.UsesVCC, ResourceInfoKind::RIK_UsesVCC);
+    setToLocal(FRI.UsesFlatScratch, ResourceInfoKind::RIK_UsesFlatScratch);
+    setToLocal(FRI.HasDynamicallySizedStack,
+               ResourceInfoKind::RIK_HasDynSizedStack);
+    setToLocal(FRI.HasRecursion, ResourceInfoKind::RIK_HasRecursion);
+    setToLocal(FRI.HasIndirectCall, ResourceInfoKind::RIK_HasIndirectCall);
+  }
+}
+
+const MCExpr *MCResourceInfo::createTotalNumVGPRs(const MachineFunction &MF,
+                                                  MCContext &Ctx) {
+  return AMDGPUMCExpr::createTotalNumVGPR(
+      getSymRefExpr(MF.getName(), RIK_NumAGPR, Ctx),
+      getSymRefExpr(MF.getName(), RIK_NumVGPR, Ctx), Ctx);
+}
+
+const MCExpr *MCResourceInfo::createTotalNumSGPRs(const MachineFunction &MF,
+                                                  bool hasXnack,
+                                                  MCContext &Ctx) {
+  return MCBinaryExpr::createAdd(
+      getSymRefExpr(MF.getName(), RIK_NumSGPR, Ctx),
+      AMDGPUMCExpr::createExtraSGPRs(
+          getSymRefExpr(MF.getName(), RIK_UsesVCC, Ctx),
+          getSymRefExpr(MF.getName(), RIK_UsesFlatScratch, Ctx), hasXnack, Ctx),
+      Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
new file mode 100644
index 00000000000000..6646003693a67f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -0,0 +1,94 @@
+//===- AMDGPUMCResourceInfo.h ----- MC Resource Info --------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief MC infrastructure to propagate the function level resource usage
+/// info.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUResourceUsageAnalysis.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
+
+namespace llvm {
+
+class MCContext;
+class MCSymbol;
+class StringRef;
+class MachineFunction;
+
+class MCResourceInfo {
+public:
+  enum ResourceInfoKind {
+    RIK_NumVGPR,
+    RIK_NumAGPR,
+    RIK_NumSGPR,
+    RIK_PrivateSegSize,
+    RIK_UsesVCC,
+    RIK_UsesFlatScratch,
+    RIK_HasDynSizedStack,
+    RIK_HasRecursion,
+    RIK_HasIndirectCall
+  };
+
+private:
+  int32_t MaxVGPR;
+  int32_t MaxAGPR;
+  int32_t MaxSGPR;
+
+  MCContext &OutContext;
+  bool finalized;
+
+  void assignResourceInfoExpr(int64_t localValue, ResourceInfoKind RIK,
+                              AMDGPUMCExpr::VariantKind Kind,
+                              const MachineFunction &MF,
+                              const SmallVectorImpl<const Function *> &Callees);
+
+  // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
+  void assignMaxRegs();
+
+public:
+  MCResourceInfo(MCContext &OutContext)
+      : MaxVGPR(0), MaxAGPR(0), MaxSGPR(0), OutContext(OutContext),
+        finalized(false) {}
+  void addMaxVGPRCandidate(int32_t candidate) {
+    MaxVGPR = std::max(MaxVGPR, candidate);
+  }
+  void addMaxAGPRCandidate(int32_t candidate) {
+    MaxAGPR = std::max(MaxAGPR, candidate);
+  }
+  void addMaxSGPRCandidate(int32_t candidate) {
+    MaxSGPR = std::max(MaxSGPR, candidate);
+  }
+
+  MCSymbol *getSymbol(StringRef FuncName, ResourceInfoKind RIK);
+  const MCExpr *getSymRefExpr(StringRef FuncName, ResourceInfoKind RIK,
+                              MCContext &Ctx);
+
+  // Resolves the final symbols that require the inter-function resource info
+  // to be resolved.
+  void Finalize();
+
+  MCSymbol *getMaxVGPRSymbol();
+  MCSymbol *getMaxAGPRSymbol();
+  MCSymbol *getMaxSGPRSymbol();
+
+  /// AMDGPUResourceUsageAnalysis gathers resource usage on a per-function
+  /// granularity. However, some resource info has to be assigned the call
+  /// transitive maximum or accumulative. For example, if A calls B and B's VGPR
+  /// usage exceeds A's, A should be assigned B's VGPR usage. Furthermore,
+  /// functions with indirect calls should be assigned the module level maximum.
+  void gatherResourceInfo(
+      const MachineFunction &MF,
+      const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI);
+
+  const MCExpr *createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx);
+  const MCExpr *createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack,
+                                    MCContext &Ctx);
+};
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 0aca99a82d1978..1ee3c40d69a3b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -13,14 +13,6 @@
 /// The results of this analysis are used to fill the register usage, flat
 /// usage, etc. into hardware registers.
 ///
-/// The analysis takes callees into account. E.g. if a function A that needs 10
-/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
-/// will return 20.
-/// It is assumed that an indirect call can go into any function except
-/// hardware-entrypoints. Therefore the register usage of functions with
-/// indirect calls is estimated as the maximum of all non-entrypoint functions
-/// in the module.
-///
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUResourceUsageAnalysis.h"
@@ -28,8 +20,8 @@
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
@@ -78,92 +70,37 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
   return false;
 }
 
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
-    const GCNSubtarget &ST) const {
-  return NumExplicitSGPR +
-         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
-                                   ST.getTargetID().isXnackOnOrAny());
-}
-
-int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
-    const GCNSubtarget &ST) const {
-  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), NumAGPR, NumVGPR);
-}
-
-bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
+bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC)
     return false;
 
-  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
   const TargetMachine &TM = TPC->getTM<TargetMachine>();
   const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
-  bool HasIndirectCall = false;
-
-  CallGraph CG = CallGraph(M);
-  auto End = po_end(&CG);
 
   // By default, for code object v5 and later, track only the minimum scratch
   // size
   uint32_t AssumedStackSizeForDynamicSizeObjects =
       clAssumedStackSizeForDynamicSizeObjects;
   uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
-  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
+          AMDGPU::AMDHSA_COV5 ||
       STI.getTargetTriple().getOS() == Triple::AMDPAL) {
-    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
+    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
       AssumedStackSizeForDynamicSizeObjects = 0;
-    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
+    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
       AssumedStackSizeForExternalCall = 0;
   }
 
-  for (auto IT = po_begin(&CG); IT != End; ++IT) {
-    Function *F = IT->getFunction();
-    if (!F || F->isDeclaration())
-      continue;
-
-    MachineFunction *MF = MMI.getMachineFunction(*F);
-    assert(MF && "function must have been generated already");
-
-    auto CI =
-        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
-    SIFunctionResourceInfo &Info = CI.first->second;
-    assert(CI.second && "should only be called once per function");
-    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
-                                AssumedStackSizeForExternalCall);
-    HasIndirectCall |= Info.HasIndirectCall;
-  }
-
-  // It's possible we have unreachable functions in the module which weren't
-  // visited by the PO traversal. Make sure we have some resource counts to
-  // report.
-  for (const auto &IT : CG) {
-    const Function *F = IT.first;
-    if (!F || F->isDeclaration())
-      continue;
-
-    auto CI =
-        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
-    if (!CI.second) // Skip already visited functions
-      continue;
-
-    SIFunctionResourceInfo &Info = CI.first->second;
-    MachineFunction *MF = MMI.getMachineFunction(*F);
-    assert(MF && "function must have been generated already");
-    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
-                                AssumedStackSizeForExternalCall);
-    HasIndirectCall |= Info.HasIndirectCall;
-  }
-
-  if (HasIndirectCall)
-    propagateIndirectCallRegisterUsage();
+  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
+                                      AssumedStackSizeForExternalCall);
 
   return false;
 }
 
 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
-    const MachineFunction &MF, const TargetMachine &TM,
-    uint32_t AssumedStackSizeForDynamicSizeObjects,
+    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
     uint32_t AssumedStackSizeForExternalCall) const {
   SIFunctionResourceInfo Info;
 
@@ -253,7 +190,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   int32_t MaxVGPR = -1;
   int32_t MaxAGPR = -1;
   int32_t MaxSGPR = -1;
-  uint64_t CalleeFrameSize = 0;
+  Info.CalleeSegmentSize = 0;
 
   for (const MachineBasicBlock &MBB : MF) {
     for (const MachineInstr &MI : MBB) {
@@ -512,8 +449,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
 
         const Function *Callee = getCalleeFunction(*CalleeOp);
-        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
-            CallGraphResourceInfo.end();
 
         // Avoid crashing on undefined behavior with an illegal call to a
         // kernel. If a callsite's calling convention doesn't match the
@@ -522,9 +457,14 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
           report_fatal_error("invalid call to entry function");
 
+        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
+          return F == &MF.getFunction();
+        };
+
+        if (Callee && !isSameFunction(MF, Callee))
+          Info.Callees.push_back(Callee);
+
         bool IsIndirect = !Callee || Callee->isDeclaration();
-        if (!IsIndirect)
-          I = CallGraphResourceInfo.find(Callee);
 
         // FIXME: Call site could have norecurse on it
         if (!Callee || !Callee->doesNotRecurse()) {
@@ -539,15 +479,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
             // directly call the tail called function. If a kernel directly
             // calls a tail recursive function, we'll assume maximum stack size
             // based on the regular call instruction.
-            CalleeFrameSize = std::max(
-                CalleeFrameSize,
+            Info.CalleeSegmentSize = std::max(
+                Info.CalleeSegmentSize,
                 static_cast<uint64_t>(AssumedStackSizeForExternalCall));
           }
         }
 
-        if (IsIndirect || I == CallGraphResourceInfo.end()) {
-          CalleeFrameSize =
-              std::max(CalleeFrameSize,
+        if (IsIndirect) {
+          Info.CalleeSegmentSize =
+              std::max(Info.CalleeSegmentSize,
                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
 
           // Register usage of indirect calls gets handled later
@@ -555,19 +495,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
           Info.HasDynamicallySizedStack = true;
           Info.HasIndirectCall = true;
-        } else {
-          // We force CodeGen to run in SCC order, so the callee's register
-          // usage etc. should be the cumulative usage of all callees.
-          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
-          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
-          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
-          CalleeFrameSize =
-              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
-          Info.UsesVCC |= I->second.UsesVCC;
-          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
-          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
-          Info.HasRecursion |= I->second.HasRecursion;
-          Info.HasIndirectCall |= I->second.HasIndirectCall;
         }
       }
     }
@@ -576,36 +503,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
   Info.NumExplicitSGPR = MaxSGPR + 1;
   Info.NumVGPR = MaxVGPR + 1;
   Info.NumAGPR = MaxAGPR + 1;
-  Info.PrivateSegmentSize += CalleeFrameSize;
 
   return Info;
 }
-
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
-  // Collect the maximum number of registers from non-hardware-entrypoints.
-  // All these functions are potential targets for indirect calls.
-  int32_t NonKernelMaxSGPRs = 0;
-  int32_t NonKernelMaxVGPRs = 0;
-  int32_t NonKernelMaxAGPRs = 0;
-
-  for (const auto &I : CallGraphResourceInfo) {
-    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
-      auto &Info = I.getSecond();
-      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
-      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
-      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
-    }
-  }
-
-  // Add register usage for functions with indirect calls.
-  // For calls to unknown functions, we assume the maximum register usage of
-  // all non-hardware-entrypoints in the current module.
-  for (auto &I : CallGraphResourceInfo) {
-    auto &Info = I.getSecond();
-    if (Info.HasIndirectCall) {
-      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
-      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
-      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
-    }
-  }
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
index 7f71de6749dcef..92ef41f49b3ba8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
 
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 
 namespace llvm {
 
@@ -24,10 +24,9 @@ class GCNSubtarget;
 class MachineFunction;
 class TargetMachine;
 
-struct AMDGPUResourceUsageAnalysis : public ModulePass {
-  static char ID;
-
+struct AMDGPUResourceUsageAnalysis : public MachineFunctionPass {
 public:
+  static char ID;
   // Track resource usage for callee functions.
   struct SIFunctionResourceInfo {
     // Track the number of explicitly used VGPRs. Special registers reserved at
@@ -35,48 +34,33 @@ struct AMDGPUResourceUsageAnalysis : public ModulePass {
     int32_t NumVGPR = 0;
     int32_t NumAGPR = 0;
     int32_t NumExplicitSGPR = 0;
+    uint64_t CalleeSegmentSize = 0;
     uint64_t PrivateSegmentSize = 0;
     bool UsesVCC = false;
     bool UsesFlatScratch = false;
     bool HasDynamicallySizedStack = false;
     bool HasRecursion = false;
     bool HasIndirectCall = false;
-
-    int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
-    // Total number of VGPRs is actually a combination of AGPR and VGPR
-    // depending on architecture - and some alignment constraints
-    int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
+    SmallVector<const Function *, 16> Callees;
   };
 
-  AMDGPUResourceUsageAnalysis() : ModulePass(ID) {}
+  AMDGPUResourceUsageAnalysis() : MachineFunctionPass(ID) {}
 
-  bool doInitialization(Module &M) override {
-    CallGraphResourceInfo.clear();
-    return ModulePass::doInitialization(M);
-  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  bool runOnModule(Module &M) override;
+  const SIFunctionResourceInfo &getResourceInfo() const { return ResourceInfo; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.setPreservesAll();
-  }
-
-  const SIFunctionResourceInfo &getResourceInfo(const Function *F) const {
-    auto Info = CallGraphResourceInfo.find(F);
-    assert(Info != CallGraphResourceInfo.end() &&
-           "Failed to find resource info for function");
-    return Info->getSecond();
+    MachineFunctionPass::getAnalysisUsage(AU);
   }
 
 private:
   SIFunctionResourceInfo
-  analyzeResourceUsage(const MachineFunction &MF, const TargetMachine &TM,
+  analyzeResourceUsage(const MachineFunction &MF,
                        uint32_t AssumedStackSizeForDynamicSizeObjects,
                        uint32_t AssumedStackSizeForExternalCall) const;
-  void propagateIndirectCallRegisterUsage();
-
-  DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+  SIFunctionResourceInfo ResourceInfo;
 };
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 671caf8484cd97..d685b635a82a6f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -82,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUInsertSingleUseVDST.cpp
+  AMDGPUMCResourceInfo.cpp
   AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 73d466abc66f7b..a1a41d6cc8c6a0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -271,6 +271,47 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
      << Alignment.value() << '\n';
 }
 
+void AMDGPUTargetAsmStreamer::EmitMCResourceInfo(
+    const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+    const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+    const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+    const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+    const MCSymbol *HasIndirectCall) {
+#define PRINT_RES_INFO(ARG)                                                    \
+  OS << "\t.set ";                                                             \
+  ARG->print(OS, getContext().getAsmInfo());                                   \
+  OS << ", ";                                                                  \
+  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  Streamer.addBlankLine();
+
+  PRINT_RES_INFO(NumVGPR);
+  PRINT_RES_INFO(NumAGPR);
+  PRINT_RES_INFO(NumExplicitSGPR);
+  PRINT_RES_INFO(PrivateSegmentSize);
+  PRINT_RES_INFO(UsesVCC);
+  PRINT_RES_INFO(UsesFlatScratch);
+  PRINT_RES_INFO(HasDynamicallySizedStack);
+  PRINT_RES_INFO(HasRecursion);
+  PRINT_RES_INFO(HasIndirectCall);
+#undef PRINT_RES_INFO
+}
+
+void AMDGPUTargetAsmStreamer::EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+                                                     const MCSymbol *MaxAGPR,
+                                                     const MCSymbol *MaxSGPR) {
+#define PRINT_RES_INFO(ARG)                                                    \
+  OS << "\t.set ";                                                             \
+  ARG->print(OS, getContext().getAsmInfo());                                   \
+  OS << ", ";                                                                  \
+  ARG->getVariableValue()->print(OS, getContext().getAsmInfo());               \
+  Streamer.addBlankLine();
+
+  PRINT_RES_INFO(MaxVGPR);
+  PRINT_RES_INFO(MaxAGPR);
+  PRINT_RES_INFO(MaxSGPR);
+#undef PRINT_RES_INFO
+}
+
 bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
   OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
   return true;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index bf1538c71d1543..e41f302c3d56ce 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -60,6 +60,17 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
   virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, Align Alignment) {
   }
 
+  virtual void EmitMCResourceInfo(
+      const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+      const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
+      const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
+      const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
+      const MCSymbol *HasIndirectCall){};
+
+  virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
+                                      const MCSymbol *MaxAGPR,
+                                      const MCSymbol *MaxSGPR){};
+
   /// \returns True on success, false on failure.
   virtual bool EmitISAVersion() { return true; }
 
@@ -136,6 +147,18 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
 
   void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
 
+  void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR,
+                          const MCSymbol *NumExplicitSGPR,
+                          const MCSymbol *PrivateSegmentSize,
+                          const MCSymbol *UsesVCC,
+                          const MCSymbol *UsesFlatScratch,
+                          const MCSymbol *HasDynamicallySizedStack,
+                          const MCSymbol *HasRecursion,
+                          const MCSymbol *HasIndirectCall) override;
+
+  void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR,
+                              const MCSymbol *MaxSGPR) override;
+
   /// \returns True on success, false on failure.
   bool EmitISAVersion() override;
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index a53bf70d77717b..92d09b3afa77d7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -215,15 +215,15 @@ void AMDGPUPALMetadata::setRegister(unsigned Reg, const MCExpr *Val,
       const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
       Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
     }
-    ExprIt->getSecond() = Val;
   } else if (N.getKind() == msgpack::Type::UInt) {
     const MCExpr *NExpr = MCConstantExpr::create(N.getUInt(), Ctx);
     Val = MCBinaryExpr::createOr(Val, NExpr, Ctx);
-    int64_t Unused;
-    if (!Val->evaluateAsAbsolute(Unused))
-      REM[Reg] = Val;
-    (void)Unused;
+  } else {
+    // Default to uint64_t 0 so additional calls to setRegister will
+    // properly propagate ORs.
+    N = (uint64_t)0;
   }
+  REM[Reg] = Val;
   DelayedExprs.assignDocNode(N, msgpack::Type::UInt, Val);
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..a7ab4393f3b0ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3025,8 +3025,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     amd_machine_version_stepping = 0
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -3036,7 +3036,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_wgp_mode = 0
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
-; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
 ; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -3061,16 +3061,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_ordered_append_gds = 0
 ; GPRIDX-NEXT:     private_element_size = 1
 ; GPRIDX-NEXT:     is_ptr64 = 1
-; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
 ; GPRIDX-NEXT:     is_debug_enabled = 0
 ; GPRIDX-NEXT:     is_xnack_enabled = 1
-; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
 ; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -3116,8 +3116,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     amd_machine_version_stepping = 3
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -3127,7 +3127,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_wgp_mode = 0
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
-; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
 ; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -3152,16 +3152,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_ordered_append_gds = 0
 ; MOVREL-NEXT:     private_element_size = 1
 ; MOVREL-NEXT:     is_ptr64 = 1
-; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
 ; MOVREL-NEXT:     is_debug_enabled = 0
 ; MOVREL-NEXT:     is_xnack_enabled = 0
-; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
 ; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
-; MOVREL-NEXT:     workitem_vgpr_count = 4
+; MOVREL-NEXT:     wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
 ; MOVREL-NEXT:     reserved_sgpr_first = 0
@@ -3208,8 +3208,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     amd_machine_version_stepping = 0
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -3219,7 +3219,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_wgp_mode = 1
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
-; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5012)&1
 ; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -3244,16 +3244,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_ordered_append_gds = 0
 ; GFX10-NEXT:     private_element_size = 1
 ; GFX10-NEXT:     is_ptr64 = 1
-; GFX10-NEXT:     is_dynamic_callstack = 0
+; GFX10-NEXT:     is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
 ; GFX10-NEXT:     is_debug_enabled = 0
 ; GFX10-NEXT:     is_xnack_enabled = 1
-; GFX10-NEXT:     workitem_private_segment_byte_size = 0
+; GFX10-NEXT:     workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
 ; GFX10-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 9
-; GFX10-NEXT:     workitem_vgpr_count = 3
+; GFX10-NEXT:     wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     amd_machine_version_stepping = 0
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -3311,7 +3311,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     enable_wgp_mode = 1
 ; GFX11-NEXT:     enable_mem_ordered = 1
 ; GFX11-NEXT:     enable_fwd_progress = 0
-; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v5f64_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion))|5018)&1
 ; GFX11-NEXT:     user_sgpr_count = 13
 ; GFX11-NEXT:     enable_trap_handler = 0
 ; GFX11-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -3336,16 +3336,16 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX11-NEXT:     enable_ordered_append_gds = 0
 ; GFX11-NEXT:     private_element_size = 1
 ; GFX11-NEXT:     is_ptr64 = 1
-; GFX11-NEXT:     is_dynamic_callstack = 0
+; GFX11-NEXT:     is_dynamic_callstack = dyn_extract_v5f64_s_s.has_dyn_sized_stack|dyn_extract_v5f64_s_s.has_recursion
 ; GFX11-NEXT:     is_debug_enabled = 0
 ; GFX11-NEXT:     is_xnack_enabled = 0
-; GFX11-NEXT:     workitem_private_segment_byte_size = 0
+; GFX11-NEXT:     workitem_private_segment_byte_size = dyn_extract_v5f64_s_s.private_seg_size
 ; GFX11-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
-; GFX11-NEXT:     workitem_vgpr_count = 3
+; GFX11-NEXT:     wavefront_sgpr_count = dyn_extract_v5f64_s_s.num_sgpr+(extrasgprs(dyn_extract_v5f64_s_s.uses_vcc, dyn_extract_v5f64_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v5f64_s_s.num_agpr, dyn_extract_v5f64_s_s.num_vgpr)
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
 ; GFX11-NEXT:     reserved_sgpr_first = 0
@@ -4042,8 +4042,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     amd_machine_version_stepping = 0
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4053,7 +4053,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_wgp_mode = 0
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
-; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
 ; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4078,16 +4078,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_ordered_append_gds = 0
 ; GPRIDX-NEXT:     private_element_size = 1
 ; GPRIDX-NEXT:     is_ptr64 = 1
-; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
 ; GPRIDX-NEXT:     is_debug_enabled = 0
 ; GPRIDX-NEXT:     is_xnack_enabled = 1
-; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
 ; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 12
-; GPRIDX-NEXT:     workitem_vgpr_count = 2
+; GPRIDX-NEXT:     wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4126,8 +4126,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     amd_machine_version_stepping = 3
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4137,7 +4137,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_wgp_mode = 0
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
-; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
 ; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4162,16 +4162,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_ordered_append_gds = 0
 ; MOVREL-NEXT:     private_element_size = 1
 ; MOVREL-NEXT:     is_ptr64 = 1
-; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
 ; MOVREL-NEXT:     is_debug_enabled = 0
 ; MOVREL-NEXT:     is_xnack_enabled = 0
-; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
 ; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 8
-; MOVREL-NEXT:     workitem_vgpr_count = 3
+; MOVREL-NEXT:     wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
 ; MOVREL-NEXT:     reserved_sgpr_first = 0
@@ -4211,8 +4211,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     amd_machine_version_stepping = 0
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX10-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4222,7 +4222,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_wgp_mode = 1
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
-; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5012)&1
 ; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4247,16 +4247,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_ordered_append_gds = 0
 ; GFX10-NEXT:     private_element_size = 1
 ; GFX10-NEXT:     is_ptr64 = 1
-; GFX10-NEXT:     is_dynamic_callstack = 0
+; GFX10-NEXT:     is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
 ; GFX10-NEXT:     is_debug_enabled = 0
 ; GFX10-NEXT:     is_xnack_enabled = 1
-; GFX10-NEXT:     workitem_private_segment_byte_size = 0
+; GFX10-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
 ; GFX10-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 8
-; GFX10-NEXT:     workitem_vgpr_count = 2
+; GFX10-NEXT:     wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4296,8 +4296,8 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     amd_machine_version_stepping = 0
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4307,7 +4307,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     enable_wgp_mode = 1
 ; GFX11-NEXT:     enable_mem_ordered = 1
 ; GFX11-NEXT:     enable_fwd_progress = 0
-; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f32_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion))|5018)&1
 ; GFX11-NEXT:     user_sgpr_count = 13
 ; GFX11-NEXT:     enable_trap_handler = 0
 ; GFX11-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4332,16 +4332,16 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     enable_ordered_append_gds = 0
 ; GFX11-NEXT:     private_element_size = 1
 ; GFX11-NEXT:     is_ptr64 = 1
-; GFX11-NEXT:     is_dynamic_callstack = 0
+; GFX11-NEXT:     is_dynamic_callstack = dyn_extract_v4f32_s_s_s.has_dyn_sized_stack|dyn_extract_v4f32_s_s_s.has_recursion
 ; GFX11-NEXT:     is_debug_enabled = 0
 ; GFX11-NEXT:     is_xnack_enabled = 0
-; GFX11-NEXT:     workitem_private_segment_byte_size = 0
+; GFX11-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f32_s_s_s.private_seg_size
 ; GFX11-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 5
-; GFX11-NEXT:     workitem_vgpr_count = 2
+; GFX11-NEXT:     wavefront_sgpr_count = dyn_extract_v4f32_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f32_s_s_s.uses_vcc, dyn_extract_v4f32_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f32_s_s_s.num_agpr, dyn_extract_v4f32_s_s_s.num_vgpr)
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
 ; GFX11-NEXT:     reserved_sgpr_first = 0
@@ -4389,8 +4389,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     amd_machine_version_stepping = 0
 ; GPRIDX-NEXT:     kernel_code_entry_byte_offset = 256
 ; GPRIDX-NEXT:     kernel_code_prefetch_byte_size = 0
-; GPRIDX-NEXT:     granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GPRIDX-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GPRIDX-NEXT:     priority = 0
 ; GPRIDX-NEXT:     float_mode = 240
 ; GPRIDX-NEXT:     priv = 0
@@ -4400,7 +4400,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_wgp_mode = 0
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
-; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
 ; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4425,16 +4425,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_ordered_append_gds = 0
 ; GPRIDX-NEXT:     private_element_size = 1
 ; GPRIDX-NEXT:     is_ptr64 = 1
-; GPRIDX-NEXT:     is_dynamic_callstack = 0
+; GPRIDX-NEXT:     is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
 ; GPRIDX-NEXT:     is_debug_enabled = 0
 ; GPRIDX-NEXT:     is_xnack_enabled = 1
-; GPRIDX-NEXT:     workitem_private_segment_byte_size = 0
+; GPRIDX-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
 ; GPRIDX-NEXT:     workgroup_group_segment_byte_size = 0
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
-; GPRIDX-NEXT:     workitem_vgpr_count = 3
+; GPRIDX-NEXT:     wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GPRIDX-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
 ; GPRIDX-NEXT:     reserved_sgpr_first = 0
@@ -4476,8 +4476,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     amd_machine_version_stepping = 3
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
-; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_workitem_vgpr_count = (11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = ((11468800|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 4))/4)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4487,7 +4487,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_wgp_mode = 0
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
-; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*64, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
 ; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4512,16 +4512,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_ordered_append_gds = 0
 ; MOVREL-NEXT:     private_element_size = 1
 ; MOVREL-NEXT:     is_ptr64 = 1
-; MOVREL-NEXT:     is_dynamic_callstack = 0
+; MOVREL-NEXT:     is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
 ; MOVREL-NEXT:     is_debug_enabled = 0
 ; MOVREL-NEXT:     is_xnack_enabled = 0
-; MOVREL-NEXT:     workitem_private_segment_byte_size = 0
+; MOVREL-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
 ; MOVREL-NEXT:     workgroup_group_segment_byte_size = 0
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
-; MOVREL-NEXT:     workitem_vgpr_count = 4
+; MOVREL-NEXT:     wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; MOVREL-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
 ; MOVREL-NEXT:     reserved_sgpr_first = 0
@@ -4564,8 +4564,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     amd_machine_version_stepping = 0
 ; GFX10-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX10-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX10-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX10-NEXT:     granulated_wavefront_sgpr_count = 1
+; GFX10-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX10-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX10-NEXT:     priority = 0
 ; GFX10-NEXT:     float_mode = 240
 ; GFX10-NEXT:     priv = 0
@@ -4575,7 +4575,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_wgp_mode = 1
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
-; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 1024))/1024)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5012)&1
 ; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4600,16 +4600,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_ordered_append_gds = 0
 ; GFX10-NEXT:     private_element_size = 1
 ; GFX10-NEXT:     is_ptr64 = 1
-; GFX10-NEXT:     is_dynamic_callstack = 0
+; GFX10-NEXT:     is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
 ; GFX10-NEXT:     is_debug_enabled = 0
 ; GFX10-NEXT:     is_xnack_enabled = 1
-; GFX10-NEXT:     workitem_private_segment_byte_size = 0
+; GFX10-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
 ; GFX10-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX10-NEXT:     gds_segment_byte_size = 0
 ; GFX10-NEXT:     kernarg_segment_byte_size = 28
 ; GFX10-NEXT:     workgroup_fbarrier_count = 0
-; GFX10-NEXT:     wavefront_sgpr_count = 9
-; GFX10-NEXT:     workitem_vgpr_count = 3
+; GFX10-NEXT:     wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 1))
+; GFX10-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
 ; GFX10-NEXT:     reserved_vgpr_first = 0
 ; GFX10-NEXT:     reserved_vgpr_count = 0
 ; GFX10-NEXT:     reserved_sgpr_first = 0
@@ -4652,8 +4652,8 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     amd_machine_version_stepping = 0
 ; GFX11-NEXT:     kernel_code_entry_byte_offset = 256
 ; GFX11-NEXT:     kernel_code_prefetch_byte_size = 0
-; GFX11-NEXT:     granulated_workitem_vgpr_count = 0
-; GFX11-NEXT:     granulated_wavefront_sgpr_count = 0
+; GFX11-NEXT:     granulated_workitem_vgpr_count = (1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))&63
+; GFX11-NEXT:     granulated_wavefront_sgpr_count = ((1622081536|(((((alignto(max(max(totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr), 1, 0), 1), 8))/8)-1)&63)|(((((alignto(max(max(dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0)), 1, 0), 1), 8))/8)-1)&15)<<6)))>>6)&15
 ; GFX11-NEXT:     priority = 0
 ; GFX11-NEXT:     float_mode = 240
 ; GFX11-NEXT:     priv = 0
@@ -4663,7 +4663,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     enable_wgp_mode = 1
 ; GFX11-NEXT:     enable_mem_ordered = 1
 ; GFX11-NEXT:     enable_fwd_progress = 0
-; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
+; GFX11-NEXT:     enable_sgpr_private_segment_wave_byte_offset = (((((alignto(dyn_extract_v4f64_s_s_s.private_seg_size*32, 256))/256)>0)||(dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion))|5018)&1
 ; GFX11-NEXT:     user_sgpr_count = 13
 ; GFX11-NEXT:     enable_trap_handler = 0
 ; GFX11-NEXT:     enable_sgpr_workgroup_id_x = 1
@@ -4688,16 +4688,16 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX11-NEXT:     enable_ordered_append_gds = 0
 ; GFX11-NEXT:     private_element_size = 1
 ; GFX11-NEXT:     is_ptr64 = 1
-; GFX11-NEXT:     is_dynamic_callstack = 0
+; GFX11-NEXT:     is_dynamic_callstack = dyn_extract_v4f64_s_s_s.has_dyn_sized_stack|dyn_extract_v4f64_s_s_s.has_recursion
 ; GFX11-NEXT:     is_debug_enabled = 0
 ; GFX11-NEXT:     is_xnack_enabled = 0
-; GFX11-NEXT:     workitem_private_segment_byte_size = 0
+; GFX11-NEXT:     workitem_private_segment_byte_size = dyn_extract_v4f64_s_s_s.private_seg_size
 ; GFX11-NEXT:     workgroup_group_segment_byte_size = 0
 ; GFX11-NEXT:     gds_segment_byte_size = 0
 ; GFX11-NEXT:     kernarg_segment_byte_size = 28
 ; GFX11-NEXT:     workgroup_fbarrier_count = 0
-; GFX11-NEXT:     wavefront_sgpr_count = 7
-; GFX11-NEXT:     workitem_vgpr_count = 3
+; GFX11-NEXT:     wavefront_sgpr_count = dyn_extract_v4f64_s_s_s.num_sgpr+(extrasgprs(dyn_extract_v4f64_s_s_s.uses_vcc, dyn_extract_v4f64_s_s_s.uses_flat_scratch, 0))
+; GFX11-NEXT:     workitem_vgpr_count = totalnumvgprs(dyn_extract_v4f64_s_s_s.num_agpr, dyn_extract_v4f64_s_s_s.num_vgpr)
 ; GFX11-NEXT:     reserved_vgpr_first = 0
 ; GFX11-NEXT:     reserved_vgpr_count = 0
 ; GFX11-NEXT:     reserved_sgpr_first = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 66b88236bbb4c1..0221d0f790be0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -18,7 +18,7 @@ target triple = "amdgcn-amd-amdhsa"
 ; RW-FLAT:     .amdhsa_system_sgpr_private_segment_wavefront_offset
 ; RW-FLAT-NOT: .amdhsa_enable_private_segment
 ; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT:     .amdhsa_enable_private_segment 1
+; RO-FLAT:     .amdhsa_enable_private_segment (((((alignto(stack_object_addrspacecast_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_addrspacecast_in_kernel_no_calls.has_dyn_sized_stack|stack_object_addrspacecast_in_kernel_no_calls.has_recursion))|128)&1
 ; GCN:         COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
 ; RW-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 6
 ; RO-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -38,11 +38,12 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
 ; RW-FLAT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; RW-FLAT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|140)&1
 ; RW-FLAT-NOT: .amdhsa_enable_private_segment
 ; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT:     .amdhsa_enable_private_segment 1
-; RW-FLAT:     .amdhsa_reserve_flat_scratch 0
+; RO-FLAT:     .amdhsa_enable_private_segment (((((alignto(stack_object_in_kernel_no_calls.private_seg_size*64, 1024))/1024)>0)||(stack_object_in_kernel_no_calls.has_dyn_sized_stack|stack_object_in_kernel_no_calls.has_recursion))|128)&1
+; RW-FLAT:     .amdhsa_reserve_flat_scratch stack_object_in_kernel_no_calls.uses_flat_scratch
+; RW-FLAT:     .set stack_object_in_kernel_no_calls.uses_flat_scratch, 0
 ; GCN:         COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
 ; RW-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 6
 ; RO-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 0
@@ -58,11 +59,12 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
 ; RW-FLAT:     .amdhsa_user_sgpr_flat_scratch_init 0
 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
-; RW-FLAT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; RW-FLAT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|136)&1
 ; RW-FLAT-NOT: .amdhsa_enable_private_segment
 ; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
-; RO-FLAT:     .amdhsa_enable_private_segment 0
-; RW-FLAT:     .amdhsa_reserve_flat_scratch 0
+; RO-FLAT:     .amdhsa_enable_private_segment (((((alignto(kernel_no_calls_no_stack.private_seg_size*64, 1024))/1024)>0)||(kernel_no_calls_no_stack.has_dyn_sized_stack|kernel_no_calls_no_stack.has_recursion))|128)&1
+; RW-FLAT:     .amdhsa_reserve_flat_scratch kernel_no_calls_no_stack.uses_flat_scratch
+; RW-FLAT:     .set kernel_no_calls_no_stack.uses_flat_scratch, 0
 ; GCN:         COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
 ; RW-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 4
 ; RO-FLAT:     COMPUTE_PGM_RSRC2:USER_SGPR: 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
index d5646820a19832..374ce0676d2205 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.workitem.id.ll
@@ -16,7 +16,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_x:
 ; MESA3D: enable_vgpr_workitem_id = 0
@@ -33,7 +33,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_y:
 ; MESA3D: enable_vgpr_workitem_id = 1
@@ -51,7 +51,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_z:
 ; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
index c7afbeabbbb6b1..1c3db1d64b299d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -75,10 +75,10 @@ bb.2:
   store volatile i32 0, ptr addrspace(1) undef
   ret void
 }
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 16
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
 ; DEFAULTSIZE: ; ScratchSize: 16
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size
 ; ASSUME1024: ; ScratchSize: 1040
 
 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -137,10 +137,10 @@ bb.1:
   ret void
 }
 
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 64
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
 ; DEFAULTSIZE: ; ScratchSize: 64
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
 ; ASSUME1024: ; ScratchSize: 1088
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
index 8d87b53efb4e73..e311be4b12218a 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -2,9 +2,10 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX90A %s
 
 ; GCN-LABEL: {{^}}kernel_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 44
-; GFX90A: .amdhsa_accum_offset 12
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_32_agprs.num_agpr, kernel_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GFX908: .set kernel_32_agprs.num_vgpr, 9
+; GFX908: .set kernel_32_agprs.num_agpr, 32
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
 ; GFX908: TotalNumVgprs: 32
@@ -24,8 +25,9 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}kernel_0_agprs:
-; GCN:    .amdhsa_next_free_vgpr 1
-; GFX90A: .amdhsa_accum_offset 4
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_0_agprs.num_agpr, kernel_0_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_0_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN:    .set kernel_0_agprs.num_vgpr, 1
 ; GCN:    NumVgprs: 1
 ; GCN:    NumAgprs: 0
 ; GCN:    TotalNumVgprs: 1
@@ -42,9 +44,10 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}kernel_40_vgprs:
-; GFX908: .amdhsa_next_free_vgpr 40
-; GFX90A: .amdhsa_next_free_vgpr 56
-; GFX90A: .amdhsa_accum_offset 40
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_40_vgprs.num_agpr, kernel_40_vgprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_40_vgprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_40_vgprs.num_vgpr, 40
+; GFX90A: .set kernel_40_vgprs.num_agpr, 16
 ; GCN:    NumVgprs: 40
 ; GCN:    NumAgprs: 16
 ; GFX908: TotalNumVgprs: 40
@@ -99,9 +102,10 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}kernel_max_gprs:
-; GFX908: .amdhsa_next_free_vgpr 256
-; GFX90A: .amdhsa_next_free_vgpr 512
-; GFX90A: .amdhsa_accum_offset 256
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_max_gprs.num_agpr, kernel_max_gprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_max_gprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN:    .set kernel_max_gprs.num_vgpr, 256
+; GFX90A: .set kernel_max_gprs.num_agpr, 256
 ; GCN:    NumVgprs: 256
 ; GCN:    NumAgprs: 256
 ; GFX908: TotalNumVgprs: 256
@@ -121,8 +125,10 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}kernel_call_func_32_agprs:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_accum_offset 12
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_func_32_agprs.num_agpr, kernel_call_func_32_agprs.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_func_32_agprs.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN: .set kernel_call_func_32_agprs.num_vgpr, max(0, func_32_agprs.num_vgpr)
+; GCN: .set kernel_call_func_32_agprs.num_agpr, max(0, func_32_agprs.num_agpr)
 ; GCN:    NumVgprs: 9
 ; GCN:    NumAgprs: 32
 ; GFX908: TotalNumVgprs: 32
@@ -154,25 +160,28 @@ bb:
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
-; GCN:    NumVgprs: 32
-; GCN:    NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN:    .amdhsa_next_free_vgpr max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: .amdhsa_accum_offset ((((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
+; GCN:    .set kernel_call_undef_func.num_vgpr, max(32, max_num_vgpr)
+; GCN:    .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; GCN:    NumVgprs: kernel_call_undef_func.num_vgpr
+; GCN:    NumAgprs: kernel_call_undef_func.num_agpr
+; GCN:    TotalNumVgprs: totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr)
+; GFX908: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 4))/4)-1
+; GFX90A: VGPRBlocks: ((alignto(max(max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0), 1), 8))/8)-1
+; GCN:    NumVGPRsForWavesPerEU: max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0)
+; GFX90A: AccumOffset: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)+1)*4
+; GFX908: Occupancy: occupancy(10, 4, 256, 8, 10, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: Occupancy: occupancy(8, 8, 512, 8, 8, max(kernel_call_undef_func.num_sgpr+(extrasgprs(kernel_call_undef_func.uses_vcc, kernel_call_undef_func.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(kernel_call_undef_func.num_agpr, kernel_call_undef_func.num_vgpr), 1, 0))
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: ((((alignto(max(1, kernel_call_undef_func.num_vgpr), 4))/4)-1)&(~65536))&63
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
   ret void
 }
 
+; GCN:      .set max_num_vgpr, 32
+; GCN-NEXT: .set max_num_agpr, 32
+; GCN-NEXT: .set max_num_sgpr, 34
+
 attributes #0 = { nounwind noinline "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index 9ec8e425a3f55c..993ff4e4477d35 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -35,7 +35,8 @@
 
 ; FIXME: Creating the emergency stack slots causes us to over-estimate scratch
 ; by 4 bytes.
-; HSA-ALLOCA: .amdhsa_private_segment_fixed_size 24
+; HSA-ALLOCA: .amdhsa_private_segment_fixed_size mova_same_clause.private_seg_size
+; HSA-ALLOCA: .set mova_same_clause.private_seg_size, 24
 
 ; HSA-ALLOCA: s_add_i32 s12, s12, s17
 ; HSA-ALLOCA-DAG: s_mov_b32 flat_scratch_lo, s13
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
index 99a7ae37e0e78d..f64a5e01cd2560 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-metadata-agpr-register-count.ll
@@ -60,7 +60,9 @@ bb:
 declare void @undef_func()
 
 ; CHECK:      .type          kernel_call_undef_func
-; CHECK:      NumAgprs:       32
+; CHECK:      .set kernel_call_undef_func.num_agpr, max(0, max_num_agpr)
+; CHECK:      NumAgprs: kernel_call_undef_func.num_agpr
+; CHECK:      .set max_num_agpr, 32
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index e4d427a0b826f8..c893f6b04b7b66 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -547,18 +547,20 @@ define amdgpu_kernel void @f256() #256 {
 attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
 
 ; GCN-LABEL: {{^}}f512:
-; GFX9: NumVgprs: 128
-; GFX90A: NumVgprs: 128
-; GFX90A: NumAgprs: 128
-; GFX90A: TotalNumVgprs: 256
-; GFX10WGP-WAVE32: NumVgprs: 256
-; GFX10WGP-WAVE64: NumVgprs: 256
-; GFX10CU-WAVE32: NumVgprs: 128
-; GFX10CU-WAVE64: NumVgprs: 128
-; GFX11WGP-WAVE32: NumVgprs: 256
-; GFX11WGP-WAVE64: NumVgprs: 256
-; GFX11CU-WAVE32: NumVgprs: 192
-; GFX11CU-WAVE64: NumVgprs: 192
+; GFX9:             .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A:           .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX90A:           .set f512.num_agpr, max(128, max_num_agpr)
+; GFX10WGP-WAVE32:  .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10WGP-WAVE64:  .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX10CU-WAVE32:   .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE64:   .set f512.num_vgpr, max(128, max_num_vgpr)
+; GFX11WGP-WAVE32:  .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11WGP-WAVE64:  .set f512.num_vgpr, max(256, max_num_vgpr)
+; GFX11CU-WAVE32:   .set f512.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE64:   .set f512.num_vgpr, max(192, max_num_vgpr)
+; GCN:              NumVgprs: f512.num_vgpr
+; GFX90A:           NumAgprs: f512.num_agpr
+; GFX90A:           TotalNumVgprs: totalnumvgprs(f512.num_agpr, f512.num_vgpr)
 define amdgpu_kernel void @f512() #512 {
   call void @foo()
   call void @use256vgprs()
@@ -567,17 +569,20 @@ define amdgpu_kernel void @f512() #512 {
 attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
 
 ; GCN-LABEL: {{^}}f1024:
-; GFX9: NumVgprs: 64
-; GFX90A: NumAgprs: 64
-; GFX90A: TotalNumVgprs: 128
-; GFX10WGP-WAVE32: NumVgprs: 128
-; GFX10WGP-WAVE64: NumVgprs: 128
-; GFX10CU-WAVE32: NumVgprs: 64
-; GFX10CU-WAVE64: NumVgprs: 64
-; GFX11WGP-WAVE32: NumVgprs: 192
-; GFX11WGP-WAVE64: NumVgprs: 192
-; GFX11CU-WAVE32: NumVgprs: 96
-; GFX11CU-WAVE64: NumVgprs: 96
+; GFX9:             .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A:           .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX90A:           .set f1024.num_agpr, max(64, max_num_agpr)
+; GFX10WGP-WAVE32:  .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10WGP-WAVE64:  .set f1024.num_vgpr, max(128, max_num_vgpr)
+; GFX10CU-WAVE32:   .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX10CU-WAVE64:   .set f1024.num_vgpr, max(64, max_num_vgpr)
+; GFX11WGP-WAVE32:  .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11WGP-WAVE64:  .set f1024.num_vgpr, max(192, max_num_vgpr)
+; GFX11CU-WAVE32:   .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GFX11CU-WAVE64:   .set f1024.num_vgpr, max(96, max_num_vgpr)
+; GCN:              NumVgprs: f1024.num_vgpr
+; GFX90A:           NumAgprs: f1024.num_agpr
+; GFX90A:           TotalNumVgprs: totalnumvgprs(f1024.num_agpr, f1024.num_vgpr)
 define amdgpu_kernel void @f1024() #1024 {
   call void @foo()
   call void @use256vgprs()
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..77be8605f20015 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -10,12 +10,21 @@
 ; OPT: .amdhsa_user_sgpr_dispatch_id 0
 ; OPT: .amdhsa_user_sgpr_flat_scratch_init 0
 ; OPT: .amdhsa_user_sgpr_private_segment_size 0
-; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; OPT: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|136)&1
 ; OPT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; OPT: .amdhsa_system_sgpr_workgroup_id_y 0
 ; OPT: .amdhsa_system_sgpr_workgroup_id_z 0
 ; OPT: .amdhsa_system_sgpr_workgroup_info 0
 ; OPT: .amdhsa_system_vgpr_workitem_id 0
+; OPT: .set foo.num_vgpr, 0
+; OPT: .set foo.num_agpr, 0
+; OPT: .set foo.num_sgpr, 0
+; OPT: .set foo.private_seg_size, 0
+; OPT: .set foo.uses_vcc, 0
+; OPT: .set foo.uses_flat_scratch, 0
+; OPT: .set foo.has_dyn_sized_stack, 0
+; OPT: .set foo.has_recursion, 0
+; OPT: .set foo.has_indirect_call, 0
 
 ; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1
@@ -25,12 +34,25 @@
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
 ; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
-; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5016)&1
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(foo.private_seg_size*64, 1024))/1024)>0)||(foo.has_dyn_sized_stack|foo.has_recursion))|5012)&1
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_y 1
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_z 1
-; NOOPT: .amdhsa_system_sgpr_workgroup_info 0
-; NOOPT: .amdhsa_system_vgpr_workitem_id 2
+; COV4: .amdhsa_system_sgpr_workgroup_info 0
+; COV5: .amdhsa_system_sgpr_workgroup_info 0
+; COV4: .amdhsa_system_vgpr_workitem_id 2
+; COV5: .amdhsa_system_vgpr_workitem_id 2
+; NOOPT: .set foo.num_vgpr, 0
+; NOOPT: .set foo.num_agpr, 0
+; NOOPT: .set foo.num_sgpr, 0
+; NOOPT: .set foo.private_seg_size, 0
+; NOOPT: .set foo.uses_vcc, 0
+; NOOPT: .set foo.uses_flat_scratch, 0
+; NOOPT: .set foo.has_dyn_sized_stack, 0
+; NOOPT: .set foo.has_recursion, 0
+; NOOPT: .set foo.has_indirect_call, 0
+
 define amdgpu_kernel void @foo() {
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
index a795e995603410..f5d45993742814 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=ALL %s
 ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
 
 ; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
@@ -8,12 +8,13 @@
 @alias = hidden alias void (), ptr @aliasee_default
 
 ; ALL-LABEL: {{^}}kernel:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX908-NEXT: .amdhsa_next_free_sgpr 33
+; ALL:          .amdhsa_next_free_vgpr max(totalnumvgprs(kernel.num_agpr, kernel.num_vgpr), 1, 0)
+; ALL-NEXT:     .amdhsa_next_free_sgpr (max(kernel.num_sgpr+(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel.uses_vcc, kernel.uses_flat_scratch, 1))
+; GFX90A-NEXT:  .amdhsa_accum_offset ((((((alignto(max(1, kernel.num_vgpr), 4))/4)-1)&(~65536))&63)+1)*4
 
-; GFX90A: .amdhsa_next_free_vgpr 59
-; GFX90A-NEXT: .amdhsa_next_free_sgpr 33
-; GFX90A-NEXT: .amdhsa_accum_offset 32
+; ALL:       .set kernel.num_vgpr, max(32, aliasee_default.num_vgpr)
+; ALL-NEXT:  .set kernel.num_agpr, max(0, aliasee_default.num_agpr)
+; ALL-NEXT:  .set kernel.num_sgpr, max(33, aliasee_default.num_sgpr)
 define amdgpu_kernel void @kernel() #0 {
 bb:
   call void @alias() #2
@@ -25,6 +26,9 @@ bb:
   call void asm sideeffect "; clobber a26 ", "~{a26}"()
   ret void
 }
+; ALL:      .set aliasee_default.num_vgpr, 0
+; ALL-NEXT: .set aliasee_default.num_agpr, 27
+; ALL-NEXT: .set aliasee_default.num_sgpr, 32
 
 attributes #0 = { noinline norecurse nounwind optnone }
 attributes #1 = { noinline norecurse nounwind readnone willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
index c976cc3d53b5eb..092e734ef106be 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -7,14 +7,18 @@
 @alias0 = hidden alias void (), ptr @aliasee_default_vgpr64_sgpr102
 
 ; CHECK-LABEL: {{^}}kernel0:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK:      .set kernel0.num_vgpr, max(32, aliasee_default_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel0.num_agpr, max(0, aliasee_default_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel0.num_sgpr, max(33, aliasee_default_vgpr64_sgpr102.num_sgpr)
 define amdgpu_kernel void @kernel0() #0 {
 bb:
   call void @alias0() #2
   ret void
 }
 
+; CHECK:      .set aliasee_default_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_default_vgpr64_sgpr102.num_sgpr, 32
 define internal void @aliasee_default_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
index edef71ef143dfd..f8287dc518421e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -9,8 +9,12 @@
 ; The parent kernel has a higher VGPR usage than the possible callees.
 
 ; CHECK-LABEL: {{^}}kernel1:
-; CHECK: .amdhsa_next_free_vgpr 41
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK:      .amdhsa_next_free_vgpr max(totalnumvgprs(kernel1.num_agpr, kernel1.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel1.num_sgpr+(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel1.uses_vcc, kernel1.uses_flat_scratch, 1))
+
+; CHECK:      .set kernel1.num_vgpr, max(41, aliasee_vgpr32_sgpr76.num_vgpr)
+; CHECK-NEXT: .set kernel1.num_agpr, max(0, aliasee_vgpr32_sgpr76.num_agpr)
+; CHECK-NEXT: .set kernel1.num_sgpr, max(33, aliasee_vgpr32_sgpr76.num_sgpr)
 define amdgpu_kernel void @kernel1() #0 {
 bb:
   call void asm sideeffect "; clobber v40 ", "~{v40}"()
@@ -18,6 +22,9 @@ bb:
   ret void
 }
 
+; CHECK:      .set aliasee_vgpr32_sgpr76.num_vgpr, 27
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr32_sgpr76.num_sgpr, 32
 define internal void @aliasee_vgpr32_sgpr76() #1 {
 bb:
   call void asm sideeffect "; clobber v26 ", "~{v26}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
index bb34ef1a15d2b9..a99b2295dfe85c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -7,14 +7,21 @@
 @alias2 = hidden alias void (), ptr @aliasee_vgpr64_sgpr102
 
 ; CHECK-LABEL: {{^}}kernel2:
-; CHECK: .amdhsa_next_free_vgpr 53
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK:      .amdhsa_next_free_vgpr max(totalnumvgprs(kernel2.num_agpr, kernel2.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel2.num_sgpr+(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel2.uses_vcc, kernel2.uses_flat_scratch, 1))
+
+; CHECK:      .set kernel2.num_vgpr, max(32, aliasee_vgpr64_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel2.num_agpr, max(0, aliasee_vgpr64_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel2.num_sgpr, max(33, aliasee_vgpr64_sgpr102.num_sgpr)
 define amdgpu_kernel void @kernel2() #0 {
 bb:
   call void @alias2() #2
   ret void
 }
 
+; CHECK:      .set aliasee_vgpr64_sgpr102.num_vgpr, 53
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr64_sgpr102.num_sgpr, 32
 define internal void @aliasee_vgpr64_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v52 ", "~{v52}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
index 8a88eb7e51ad72..793dc1bc3a6f33 100644
--- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -7,14 +7,21 @@
 @alias3 = hidden alias void (), ptr @aliasee_vgpr256_sgpr102
 
 ; CHECK-LABEL: {{^}}kernel3:
-; CHECK: .amdhsa_next_free_vgpr 253
-; CHECK-NEXT: .amdhsa_next_free_sgpr 33
+; CHECK:      .amdhsa_next_free_vgpr max(totalnumvgprs(kernel3.num_agpr, kernel3.num_vgpr), 1, 0)
+; CHECK-NEXT: .amdhsa_next_free_sgpr (max(kernel3.num_sgpr+(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kernel3.uses_vcc, kernel3.uses_flat_scratch, 1))
+
+; CHECK:      .set kernel3.num_vgpr, max(32, aliasee_vgpr256_sgpr102.num_vgpr)
+; CHECK-NEXT: .set kernel3.num_agpr, max(0, aliasee_vgpr256_sgpr102.num_agpr)
+; CHECK-NEXT: .set kernel3.num_sgpr, max(33, aliasee_vgpr256_sgpr102.num_sgpr)
 define amdgpu_kernel void @kernel3() #0 {
 bb:
   call void @alias3() #2
   ret void
 }
 
+; CHECK:      .set aliasee_vgpr256_sgpr102.num_vgpr, 253
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_agpr, 0
+; CHECK-NEXT: .set aliasee_vgpr256_sgpr102.num_sgpr, 33
 define internal void @aliasee_vgpr256_sgpr102() #1 {
 bb:
   call void asm sideeffect "; clobber v252 ", "~{v252}"()
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6af45035d394f8..6311c2a01d366b 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -222,10 +222,14 @@ ret:
 }
 
 ; GCN-LABEL: {{^}}usage_direct_recursion:
-; GCN: .amdhsa_private_segment_fixed_size 18448
+; GCN: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN: .set usage_direct_recursion.private_seg_size, 0+(max(16384, direct_recursion_use_stack.private_seg_size))
+; GCN: ScratchSize: 18448
 ;
 ; GCN-V5-LABEL: {{^}}usage_direct_recursion:
-; GCN-V5: .amdhsa_private_segment_fixed_size 2064{{$}}
+; GCN-V5: .amdhsa_private_segment_fixed_size usage_direct_recursion.private_seg_size
+; GCN-V5: .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN-V5: ScratchSize: 2064
 define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
   call void @direct_recursion_use_stack(i32 %n)
   ret void
@@ -234,10 +238,11 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr96_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr96_external_call.num_sgpr+4
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
 define amdgpu_kernel void @count_use_sgpr96_external_call()  {
 entry:
   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
@@ -248,10 +253,11 @@ entry:
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_sgpr160_external_call.num_vgpr, max(0, max_num_vgpr)
+; GCN: .set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_sgpr160_external_call.num_sgpr+4
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
 define amdgpu_kernel void @count_use_sgpr160_external_call()  {
 entry:
   tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -262,10 +268,11 @@ entry:
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; GCN: .set count_use_vgpr160_external_call.num_vgpr, max(5, max_num_vgpr)
+; GCN: .set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; CI: NumSgprs: count_use_vgpr160_external_call.num_sgpr+4
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
 define amdgpu_kernel void @count_use_vgpr160_external_call()  {
 entry:
   tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
@@ -273,6 +280,27 @@ entry:
   ret void
 }
 
+; GCN: .set max_num_vgpr, 50
+; GCN: .set max_num_agpr, 0
+; GCN: .set max_num_sgpr, 80
+
+; GCN-LABEL: amdhsa.kernels:
+; GCN:      .name: count_use_sgpr96_external_call
+; CI:       .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG:   .sgpr_count: 96
+; GCN:      .vgpr_count: 50
+; GCN:      .name: count_use_sgpr160_external_call
+; CI:       .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG:   .sgpr_count: 96
+; GCN:      .vgpr_count: 50
+; GCN:      .name: count_use_vgpr160_external_call
+; CI:       .sgpr_count: 84
+; VI-NOBUG: .sgpr_count: 86
+; VI-BUG:   .sgpr_count: 96
+; GCN:      .vgpr_count: 50
+
 attributes #0 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
 attributes #1 = { nounwind noinline norecurse "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
 attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..7e731a70ca4d76 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -469,7 +469,7 @@ define hidden void @use_every_sgpr_input() #1 {
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
 ; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input.has_recursion))|920)&1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
@@ -494,7 +494,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
 ; GCN: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
 ; GCN: .amdhsa_user_sgpr_private_segment_size 0
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(kern_indirect_use_every_sgpr_input_no_kernargs.private_seg_size*64, 1024))/1024)>0)||(kern_indirect_use_every_sgpr_input_no_kernargs.has_dyn_sized_stack|kern_indirect_use_every_sgpr_input_no_kernargs.has_recursion))|916)&1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 643f2619840a22..ede57f1a0a04ce 100644
--- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -33,6 +33,7 @@ bb2:
 
 ; GCN-LABEL: {{^}}preserve_condition_undef_flag:
 ; GCN-NOT: vcc
+; GCN: s_endpgm
 define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3035a8579c8a6d..13fd714933dbb6 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -15,13 +15,19 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr max(totalnumvgprs(fadd.num_agpr, fadd.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr (max(fadd.num_sgpr+(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fadd.uses_vcc, fadd.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc fadd.uses_vcc
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch fadd.uses_flat_scratch
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
+; OSABI-AMDHSA-ASM: .set fadd.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fadd.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fadd.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fadd.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fadd.uses_flat_scratch, 0
+
 ; ALL-ASM-LABEL: {{^}}fsub:
 
 ; OSABI-AMDHSA-ASM-NOT: .amdgpu_hsa_kernel
@@ -34,13 +40,19 @@
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr max(totalnumvgprs(fsub.num_agpr, fsub.num_vgpr), 1, 0)
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr (max(fsub.num_sgpr+(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(fsub.uses_vcc, fsub.uses_flat_scratch, 0))
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc fsub.uses_vcc
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch fsub.uses_flat_scratch
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
+; OSABI-AMDHSA-ASM: .set fsub.num_vgpr, 3
+; OSABI-AMDHSA-ASM: .set fsub.num_agpr, 0
+; OSABI-AMDHSA-ASM: .set fsub.num_sgpr, 8
+; OSABI-AMDHSA-ASM: .set fsub.uses_vcc, 0
+; OSABI-AMDHSA-ASM: .set fsub.uses_flat_scratch, 0
+
 ; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_version
 ; OSABI-AMDHSA-ASM-NOT: .hsa_code_object_isa
 ; OSABI-AMDHSA-ASM-NOT: .amd_amdgpu_isa
@@ -93,8 +105,10 @@ entry:
 ; registers used.
 ;
 ; ALL-ASM-LABEL: {{^}}empty:
-; ALL-ASM:     .amdhsa_next_free_vgpr 1
-; ALL-ASM:     .amdhsa_next_free_sgpr 1
+; ALL-ASM:     .amdhsa_next_free_vgpr max(totalnumvgprs(empty.num_agpr, empty.num_vgpr), 1, 0)
+; ALL-ASM:     .amdhsa_next_free_sgpr (max(empty.num_sgpr+(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(empty.uses_vcc, empty.uses_flat_scratch, 0))
+; ALL-ASM:  NumSGPRsForWavesPerEU: 1
+; ALL-ASM:  NumVGPRsForWavesPerEU: 1
 define amdgpu_kernel void @empty(
     i32 %i,
     ptr addrspace(1) %r,
diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
index 9d93609b1e8813..aa1a93cfec3d86 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll
@@ -1,8 +1,8 @@
 ; REQUIRES: asserts
-; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
-; RUN: not --crash llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm < %s | FileCheck %s
 
-; CHECK: function must have been generated already
+; CHECK-NOT: func
 
 define internal i32 @func() {
   ret i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 789150f690d52e..69a729f6847f0a 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -62,7 +62,8 @@
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
 
-; VGPR: .amdhsa_private_segment_fixed_size 16
+; VGPR: .amdhsa_private_segment_fixed_size divergent_if_endif.private_seg_size
+; VGPR: .set divergent_if_endif.private_seg_size, 16
 define amdgpu_kernel void @divergent_if_endif(ptr addrspace(1) %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -133,7 +134,8 @@ endif:
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
 
-; VGPR: .amdhsa_private_segment_fixed_size 20
+; VGPR: .amdhsa_private_segment_fixed_size divergent_loop.private_seg_size
+; VGPR: .set divergent_loop.private_seg_size, 20
 define amdgpu_kernel void @divergent_loop(ptr addrspace(1) %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/elf.ll b/llvm/test/CodeGen/AMDGPU/elf.ll
index f51d9fc5125ba6..423bb95af25df9 100644
--- a/llvm/test/CodeGen/AMDGPU/elf.ll
+++ b/llvm/test/CodeGen/AMDGPU/elf.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=CARRIZO %s
 
 ; Test that we don't try to produce a COFF file on windows
 ; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -S --symbols --file-headers - | FileCheck --check-prefix=ELF %s
@@ -20,8 +20,9 @@
 
 ; CONFIG: .section .AMDGPU.config
 ; CONFIG-NEXT: .long   45096
-; TYPICAL-NEXT: .long   0
-; TONGA-NEXT: .long   704
+; TYPICAL-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 0)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
+; TONGA-NEXT: .long   (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(96, 1), 8))/8)-1)&15)<<6)
+; CARRIZO-NEXT: .long (((((alignto(max(max(totalnumvgprs(test.num_agpr, max(totalnumvgprs(test.num_agpr, test.num_vgpr), 1)), 1, 0), 1), 4))/4)-1)&63)<<0)|(((((alignto(max(max(max(test.num_sgpr+(extrasgprs(test.uses_vcc, test.uses_flat_scratch, 1)), 0), 1, 0), 1), 8))/8)-1)&15)<<6)
 ; CONFIG: .p2align 8
 ; CONFIG: test:
 define amdgpu_ps void @test(i32 %p) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
index 78ac2f9eaff020..c4111282682527 100644
--- a/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
+++ b/llvm/test/CodeGen/AMDGPU/enable-scratch-only-dynamic-stack.ll
@@ -7,13 +7,17 @@
 ; No stack objects, only indirect call has to enable scrathch
 ; GCN-LABEL: test_indirect_call:
 
-; COV5: .amdhsa_private_segment_fixed_size 0{{$}}
-; COV4: .amdhsa_private_segment_fixed_size 16384{{$}}
-
+; GCN: .amdhsa_private_segment_fixed_size test_indirect_call.private_seg_size
 ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
+; COV5: .amdhsa_uses_dynamic_stack ((59|((test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion)<<11))&2048)>>11
+; COV5: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5016)&1
+; COV4: .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(test_indirect_call.private_seg_size*64, 1024))/1024)>0)||(test_indirect_call.has_dyn_sized_stack|test_indirect_call.has_recursion))|5020)&1
+
+; COV5: .set test_indirect_call.private_seg_size, 0{{$}}
+; COV4: .set test_indirect_call.private_seg_size, 0+(max(16384))
+; COV5: .set test_indirect_call.has_recursion, 1
+; COV5: .set test_indirect_call.has_indirect_call, 1
 
-; COV5: .amdhsa_uses_dynamic_stack 1
-; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
 define amdgpu_kernel void @test_indirect_call() {
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
new file mode 100644
index 00000000000000..c411323a70ed31
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -0,0 +1,533 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; The reported SGPR use may not match the SGPR use given in the comments, as the latter includes extra SGPRs (e.g., for VCC use).
+
+; Functions that don't make calls should have constants as their resource usage, as no resource information has to be propagated.
+
+; GCN-LABEL: {{^}}use_vcc:
+; GCN: .set use_vcc.num_vgpr, 0
+; GCN: .set use_vcc.num_agpr, 0
+; GCN: .set use_vcc.num_sgpr, 32
+; GCN: .set use_vcc.private_seg_size, 0
+; GCN: .set use_vcc.uses_vcc, 1
+; GCN: .set use_vcc.uses_flat_scratch, 0
+; GCN: .set use_vcc.has_dyn_sized_stack, 0
+; GCN: .set use_vcc.has_recursion, 0
+; GCN: .set use_vcc.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_vcc() #1 {
+  call void asm sideeffect "", "~{vcc}" () #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_vcc:
+; GCN: .set indirect_use_vcc.num_vgpr, max(41, use_vcc.num_vgpr)
+; GCN: .set indirect_use_vcc.num_agpr, max(0, use_vcc.num_agpr)
+; GCN: .set indirect_use_vcc.num_sgpr, max(34, use_vcc.num_sgpr)
+; GCN: .set indirect_use_vcc.private_seg_size, 16+(max(use_vcc.private_seg_size))
+; GCN: .set indirect_use_vcc.uses_vcc, or(1, use_vcc.uses_vcc)
+; GCN: .set indirect_use_vcc.uses_flat_scratch, or(0, use_vcc.uses_flat_scratch)
+; GCN: .set indirect_use_vcc.has_dyn_sized_stack, or(0, use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_use_vcc.has_recursion, or(0, use_vcc.has_recursion)
+; GCN: .set indirect_use_vcc.has_indirect_call, or(0, use_vcc.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_vcc() #1 {
+  call void @use_vcc()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
+; GCN: .set indirect_2level_use_vcc_kernel.num_vgpr, max(32, indirect_use_vcc.num_vgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_agpr, max(0, indirect_use_vcc.num_agpr)
+; GCN: .set indirect_2level_use_vcc_kernel.num_sgpr, max(33, indirect_use_vcc.num_sgpr)
+; GCN: .set indirect_2level_use_vcc_kernel.private_seg_size, 0+(max(indirect_use_vcc.private_seg_size))
+; GCN: .set indirect_2level_use_vcc_kernel.uses_vcc, or(1, indirect_use_vcc.uses_vcc)
+; GCN: .set indirect_2level_use_vcc_kernel.uses_flat_scratch, or(1, indirect_use_vcc.uses_flat_scratch)
+; GCN: .set indirect_2level_use_vcc_kernel.has_dyn_sized_stack, or(0, indirect_use_vcc.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_vcc_kernel.has_recursion, or(0, indirect_use_vcc.has_recursion)
+; GCN: .set indirect_2level_use_vcc_kernel.has_indirect_call, or(0, indirect_use_vcc.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
+  call void @indirect_use_vcc()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scratch:
+; GCN: .set use_flat_scratch.num_vgpr, 0
+; GCN: .set use_flat_scratch.num_agpr, 0
+; GCN: .set use_flat_scratch.num_sgpr, 32
+; GCN: .set use_flat_scratch.private_seg_size, 0
+; GCN: .set use_flat_scratch.uses_vcc, 0
+; GCN: .set use_flat_scratch.uses_flat_scratch, 1
+; GCN: .set use_flat_scratch.has_dyn_sized_stack, 0
+; GCN: .set use_flat_scratch.has_recursion, 0
+; GCN: .set use_flat_scratch.has_indirect_call, 0
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 0
+; GCN: ScratchSize: 0
+define void @use_flat_scratch() #1 {
+  call void asm sideeffect "", "~{flat_scratch}" () #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_flat_scratch:
+; GCN: .set indirect_use_flat_scratch.num_vgpr, max(41, use_flat_scratch.num_vgpr)
+; GCN: .set indirect_use_flat_scratch.num_agpr, max(0, use_flat_scratch.num_agpr)
+; GCN: .set indirect_use_flat_scratch.num_sgpr, max(34, use_flat_scratch.num_sgpr)
+; GCN: .set indirect_use_flat_scratch.private_seg_size, 16+(max(use_flat_scratch.private_seg_size))
+; GCN: .set indirect_use_flat_scratch.uses_vcc, or(1, use_flat_scratch.uses_vcc)
+; GCN: .set indirect_use_flat_scratch.uses_flat_scratch, or(0, use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_use_flat_scratch.has_dyn_sized_stack, or(0, use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_use_flat_scratch.has_recursion, or(0, use_flat_scratch.has_recursion)
+; GCN: .set indirect_use_flat_scratch.has_indirect_call, or(0, use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_flat_scratch() #1 {
+  call void @use_flat_scratch()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel:
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_vgpr, max(32, indirect_use_flat_scratch.num_vgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_agpr, max(0, indirect_use_flat_scratch.num_agpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.num_sgpr, max(33, indirect_use_flat_scratch.num_sgpr)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.private_seg_size, 0+(max(indirect_use_flat_scratch.private_seg_size))
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_vcc, or(1, indirect_use_flat_scratch.uses_vcc)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.uses_flat_scratch, or(1, indirect_use_flat_scratch.uses_flat_scratch)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_dyn_sized_stack, or(0, indirect_use_flat_scratch.has_dyn_sized_stack)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_recursion, or(0, indirect_use_flat_scratch.has_recursion)
+; GCN: .set indirect_2level_use_flat_scratch_kernel.has_indirect_call, or(0, indirect_use_flat_scratch.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 {
+  call void @indirect_use_flat_scratch()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_10_vgpr:
+; GCN: .set use_10_vgpr.num_vgpr, 10
+; GCN: .set use_10_vgpr.num_agpr, 0
+; GCN: .set use_10_vgpr.num_sgpr, 32
+; GCN: .set use_10_vgpr.private_seg_size, 0
+; GCN: .set use_10_vgpr.uses_vcc, 0
+; GCN: .set use_10_vgpr.uses_flat_scratch, 0
+; GCN: .set use_10_vgpr.has_dyn_sized_stack, 0
+; GCN: .set use_10_vgpr.has_recursion, 0
+; GCN: .set use_10_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 10
+; GCN: ScratchSize: 0
+define void @use_10_vgpr() #1 {
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4}"() #0
+  call void asm sideeffect "", "~{v5},~{v6},~{v7},~{v8},~{v9}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_10_vgpr:
+; GCN: .set indirect_use_10_vgpr.num_vgpr, max(41, use_10_vgpr.num_vgpr)
+; GCN: .set indirect_use_10_vgpr.num_agpr, max(0, use_10_vgpr.num_agpr)
+; GCN: .set indirect_use_10_vgpr.num_sgpr, max(34, use_10_vgpr.num_sgpr)
+; GCN: .set indirect_use_10_vgpr.private_seg_size, 16+(max(use_10_vgpr.private_seg_size))
+; GCN: .set indirect_use_10_vgpr.uses_vcc, or(1, use_10_vgpr.uses_vcc)
+; GCN: .set indirect_use_10_vgpr.uses_flat_scratch, or(0, use_10_vgpr.uses_flat_scratch)
+; GCN: .set indirect_use_10_vgpr.has_dyn_sized_stack, or(0, use_10_vgpr.has_dyn_sized_stack)
+; GCN: .set indirect_use_10_vgpr.has_recursion, or(0, use_10_vgpr.has_recursion)
+; GCN: .set indirect_use_10_vgpr.has_indirect_call, or(0, use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define void @indirect_use_10_vgpr() #0 {
+  call void @use_10_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
+; GCN:	.set indirect_2_level_use_10_vgpr.num_vgpr, max(32, indirect_use_10_vgpr.num_vgpr)
+; GCN:	.set indirect_2_level_use_10_vgpr.num_agpr, max(0, indirect_use_10_vgpr.num_agpr)
+; GCN:	.set indirect_2_level_use_10_vgpr.num_sgpr, max(33, indirect_use_10_vgpr.num_sgpr)
+; GCN:	.set indirect_2_level_use_10_vgpr.private_seg_size, 0+(max(indirect_use_10_vgpr.private_seg_size))
+; GCN:	.set indirect_2_level_use_10_vgpr.uses_vcc, or(1, indirect_use_10_vgpr.uses_vcc)
+; GCN:	.set indirect_2_level_use_10_vgpr.uses_flat_scratch, or(1, indirect_use_10_vgpr.uses_flat_scratch)
+; GCN:	.set indirect_2_level_use_10_vgpr.has_dyn_sized_stack, or(0, indirect_use_10_vgpr.has_dyn_sized_stack)
+; GCN:	.set indirect_2_level_use_10_vgpr.has_recursion, or(0, indirect_use_10_vgpr.has_recursion)
+; GCN:	.set indirect_2_level_use_10_vgpr.has_indirect_call, or(0, indirect_use_10_vgpr.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 16
+define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
+  call void @indirect_use_10_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_50_vgpr:
+; GCN:	.set use_50_vgpr.num_vgpr, 50
+; GCN:	.set use_50_vgpr.num_agpr, 0
+; GCN:	.set use_50_vgpr.num_sgpr, 32
+; GCN:	.set use_50_vgpr.private_seg_size, 0
+; GCN:	.set use_50_vgpr.uses_vcc, 0
+; GCN:	.set use_50_vgpr.uses_flat_scratch, 0
+; GCN:	.set use_50_vgpr.has_dyn_sized_stack, 0
+; GCN:	.set use_50_vgpr.has_recursion, 0
+; GCN:	.set use_50_vgpr.has_indirect_call, 0
+; GCN: NumSgprs: 36
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 0
+define void @use_50_vgpr() #1 {
+  call void asm sideeffect "", "~{v49}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_50_vgpr:
+; GCN:	.set indirect_use_50_vgpr.num_vgpr, max(41, use_50_vgpr.num_vgpr)
+; GCN:	.set indirect_use_50_vgpr.num_agpr, max(0, use_50_vgpr.num_agpr)
+; GCN:	.set indirect_use_50_vgpr.num_sgpr, max(34, use_50_vgpr.num_sgpr)
+; GCN:	.set indirect_use_50_vgpr.private_seg_size, 16+(max(use_50_vgpr.private_seg_size))
+; GCN:	.set indirect_use_50_vgpr.uses_vcc, or(1, use_50_vgpr.uses_vcc)
+; GCN:	.set indirect_use_50_vgpr.uses_flat_scratch, or(0, use_50_vgpr.uses_flat_scratch)
+; GCN:	.set indirect_use_50_vgpr.has_dyn_sized_stack, or(0, use_50_vgpr.has_dyn_sized_stack)
+; GCN:	.set indirect_use_50_vgpr.has_recursion, or(0, use_50_vgpr.has_recursion)
+; GCN:	.set indirect_use_50_vgpr.has_indirect_call, or(0, use_50_vgpr.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 50
+; GCN: ScratchSize: 16
+define void @indirect_use_50_vgpr() #0 {
+  call void @use_50_vgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_80_sgpr:
+; GCN:	.set use_80_sgpr.num_vgpr, 1
+; GCN:	.set use_80_sgpr.num_agpr, 0
+; GCN:	.set use_80_sgpr.num_sgpr, 80
+; GCN:	.set use_80_sgpr.private_seg_size, 8
+; GCN:	.set use_80_sgpr.uses_vcc, 0
+; GCN:	.set use_80_sgpr.uses_flat_scratch, 0
+; GCN:	.set use_80_sgpr.has_dyn_sized_stack, 0
+; GCN:	.set use_80_sgpr.has_recursion, 0
+; GCN:	.set use_80_sgpr.has_indirect_call, 0
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 8
+define void @use_80_sgpr() #1 {
+  call void asm sideeffect "", "~{s79}"() #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_80_sgpr:
+; GCN:	.set indirect_use_80_sgpr.num_vgpr, max(41, use_80_sgpr.num_vgpr)
+; GCN:	.set indirect_use_80_sgpr.num_agpr, max(0, use_80_sgpr.num_agpr)
+; GCN:	.set indirect_use_80_sgpr.num_sgpr, max(34, use_80_sgpr.num_sgpr)
+; GCN:	.set indirect_use_80_sgpr.private_seg_size, 16+(max(use_80_sgpr.private_seg_size))
+; GCN:	.set indirect_use_80_sgpr.uses_vcc, or(1, use_80_sgpr.uses_vcc)
+; GCN:	.set indirect_use_80_sgpr.uses_flat_scratch, or(0, use_80_sgpr.uses_flat_scratch)
+; GCN:	.set indirect_use_80_sgpr.has_dyn_sized_stack, or(0, use_80_sgpr.has_dyn_sized_stack)
+; GCN:	.set indirect_use_80_sgpr.has_recursion, or(0, use_80_sgpr.has_recursion)
+; GCN:	.set indirect_use_80_sgpr.has_indirect_call, or(0, use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 84
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define void @indirect_use_80_sgpr() #1 {
+  call void @use_80_sgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
+; GCN:	.set indirect_2_level_use_80_sgpr.num_vgpr, max(32, indirect_use_80_sgpr.num_vgpr)
+; GCN:	.set indirect_2_level_use_80_sgpr.num_agpr, max(0, indirect_use_80_sgpr.num_agpr)
+; GCN:	.set indirect_2_level_use_80_sgpr.num_sgpr, max(33, indirect_use_80_sgpr.num_sgpr)
+; GCN:	.set indirect_2_level_use_80_sgpr.private_seg_size, 0+(max(indirect_use_80_sgpr.private_seg_size))
+; GCN:	.set indirect_2_level_use_80_sgpr.uses_vcc, or(1, indirect_use_80_sgpr.uses_vcc)
+; GCN:	.set indirect_2_level_use_80_sgpr.uses_flat_scratch, or(1, indirect_use_80_sgpr.uses_flat_scratch)
+; GCN:	.set indirect_2_level_use_80_sgpr.has_dyn_sized_stack, or(0, indirect_use_80_sgpr.has_dyn_sized_stack)
+; GCN:	.set indirect_2_level_use_80_sgpr.has_recursion, or(0, indirect_use_80_sgpr.has_recursion)
+; GCN:	.set indirect_2_level_use_80_sgpr.has_indirect_call, or(0, indirect_use_80_sgpr.has_indirect_call)
+; GCN: NumSgprs: 86
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 24
+define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
+  call void @indirect_use_80_sgpr()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_stack0:
+; GCN:	.set use_stack0.num_vgpr, 1
+; GCN:	.set use_stack0.num_agpr, 0
+; GCN:	.set use_stack0.num_sgpr, 33
+; GCN:	.set use_stack0.private_seg_size, 2052
+; GCN:	.set use_stack0.uses_vcc, 0
+; GCN:	.set use_stack0.uses_flat_scratch, 0
+; GCN:	.set use_stack0.has_dyn_sized_stack, 0
+; GCN:	.set use_stack0.has_recursion, 0
+; GCN:	.set use_stack0.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 2052
+define void @use_stack0() #1 {
+  %alloca = alloca [512 x i32], align 4, addrspace(5)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_stack1:
+; GCN:	.set use_stack1.num_vgpr, 1
+; GCN:	.set use_stack1.num_agpr, 0
+; GCN:	.set use_stack1.num_sgpr, 33
+; GCN:	.set use_stack1.private_seg_size, 404
+; GCN:	.set use_stack1.uses_vcc, 0
+; GCN:	.set use_stack1.uses_flat_scratch, 0
+; GCN:	.set use_stack1.has_dyn_sized_stack, 0
+; GCN:	.set use_stack1.has_recursion, 0
+; GCN:	.set use_stack1.has_indirect_call, 0
+; GCN: NumSgprs: 37
+; GCN: NumVgprs: 1
+; GCN: ScratchSize: 404
+define void @use_stack1() #1 {
+  %alloca = alloca [100 x i32], align 4, addrspace(5)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_use_stack:
+; GCN:	.set indirect_use_stack.num_vgpr, max(41, use_stack0.num_vgpr)
+; GCN:	.set indirect_use_stack.num_agpr, max(0, use_stack0.num_agpr)
+; GCN:	.set indirect_use_stack.num_sgpr, max(34, use_stack0.num_sgpr)
+; GCN:	.set indirect_use_stack.private_seg_size, 80+(max(use_stack0.private_seg_size))
+; GCN:	.set indirect_use_stack.uses_vcc, or(1, use_stack0.uses_vcc)
+; GCN:	.set indirect_use_stack.uses_flat_scratch, or(0, use_stack0.uses_flat_scratch)
+; GCN:	.set indirect_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack)
+; GCN:	.set indirect_use_stack.has_recursion, or(0, use_stack0.has_recursion)
+; GCN:	.set indirect_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call)
+; GCN: NumSgprs: 38
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define void @indirect_use_stack() #1 {
+  %alloca = alloca [16 x i32], align 4, addrspace(5)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+  call void @use_stack0()
+  ret void
+}
+
+; GCN-LABEL: {{^}}indirect_2_level_use_stack:
+; GCN:	.set indirect_2_level_use_stack.num_vgpr, max(32, indirect_use_stack.num_vgpr)
+; GCN:	.set indirect_2_level_use_stack.num_agpr, max(0, indirect_use_stack.num_agpr)
+; GCN:	.set indirect_2_level_use_stack.num_sgpr, max(33, indirect_use_stack.num_sgpr)
+; GCN:	.set indirect_2_level_use_stack.private_seg_size, 0+(max(indirect_use_stack.private_seg_size))
+; GCN:	.set indirect_2_level_use_stack.uses_vcc, or(1, indirect_use_stack.uses_vcc)
+; GCN:	.set indirect_2_level_use_stack.uses_flat_scratch, or(1, indirect_use_stack.uses_flat_scratch)
+; GCN:	.set indirect_2_level_use_stack.has_dyn_sized_stack, or(0, indirect_use_stack.has_dyn_sized_stack)
+; GCN:	.set indirect_2_level_use_stack.has_recursion, or(0, indirect_use_stack.has_recursion)
+; GCN:	.set indirect_2_level_use_stack.has_indirect_call, or(0, indirect_use_stack.has_indirect_call)
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2132
+define amdgpu_kernel void @indirect_2_level_use_stack() #0 {
+  call void @indirect_use_stack()
+  ret void
+}
+
+
+; Should be maximum of callee usage
+; GCN-LABEL: {{^}}multi_call_use_use_stack:
+; GCN:	.set multi_call_use_use_stack.num_vgpr, max(41, use_stack0.num_vgpr, use_stack1.num_vgpr)
+; GCN:	.set multi_call_use_use_stack.num_agpr, max(0, use_stack0.num_agpr, use_stack1.num_agpr)
+; GCN:	.set multi_call_use_use_stack.num_sgpr, max(42, use_stack0.num_sgpr, use_stack1.num_sgpr)
+; GCN:	.set multi_call_use_use_stack.private_seg_size, 0+(max(use_stack0.private_seg_size, use_stack1.private_seg_size))
+; GCN:	.set multi_call_use_use_stack.uses_vcc, or(1, use_stack0.uses_vcc, use_stack1.uses_vcc)
+; GCN:	.set multi_call_use_use_stack.uses_flat_scratch, or(1, use_stack0.uses_flat_scratch, use_stack1.uses_flat_scratch)
+; GCN:	.set multi_call_use_use_stack.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack)
+; GCN:	.set multi_call_use_use_stack.has_recursion, or(0, use_stack0.has_recursion, use_stack1.has_recursion)
+; GCN:	.set multi_call_use_use_stack.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call)
+; GCN: NumSgprs: 48
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2052
+define amdgpu_kernel void @multi_call_use_use_stack() #0 {
+  call void @use_stack0()
+  call void @use_stack1()
+  ret void
+}
+
+declare void @external() #0
+
+; GCN-LABEL: {{^}}multi_call_with_external:
+; GCN:	.set multi_call_with_external.num_vgpr, max(41, max_num_vgpr)
+; GCN:	.set multi_call_with_external.num_agpr, max(0, max_num_agpr)
+; GCN:	.set multi_call_with_external.num_sgpr, max(42, max_num_sgpr)
+; GCN:	.set multi_call_with_external.private_seg_size, 0
+; GCN:	.set multi_call_with_external.uses_vcc, 1
+; GCN:	.set multi_call_with_external.uses_flat_scratch, 1
+; GCN:	.set multi_call_with_external.has_dyn_sized_stack, 1
+; GCN:	.set multi_call_with_external.has_recursion, 0
+; GCN:	.set multi_call_with_external.has_indirect_call, 1
+; GCN: NumSgprs: multi_call_with_external.num_sgpr+6
+; GCN: NumVgprs: multi_call_with_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @multi_call_with_external() #0 {
+  call void @use_stack0()
+  call void @use_stack1()
+  call void @external()
+  ret void
+}
+
+; GCN-LABEL: {{^}}usage_external:
+; GCN:	.set usage_external.num_vgpr, max(32, max_num_vgpr)
+; GCN:	.set usage_external.num_agpr, max(0, max_num_agpr)
+; GCN:	.set usage_external.num_sgpr, max(33, max_num_sgpr)
+; GCN:	.set usage_external.private_seg_size, 0
+; GCN:	.set usage_external.uses_vcc, 1
+; GCN:	.set usage_external.uses_flat_scratch, 1
+; GCN:	.set usage_external.has_dyn_sized_stack, 1
+; GCN:	.set usage_external.has_recursion, 0
+; GCN:	.set usage_external.has_indirect_call, 1
+; GCN: NumSgprs: usage_external.num_sgpr+6
+; GCN: NumVgprs: usage_external.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external() #0 {
+  call void @external()
+  ret void
+}
+
+declare void @external_recurse() #2
+
+; GCN-LABEL: {{^}}usage_external_recurse:
+; GCN:	.set usage_external_recurse.num_vgpr, max(32, max_num_vgpr)
+; GCN:	.set usage_external_recurse.num_agpr, max(0, max_num_agpr)
+; GCN:	.set usage_external_recurse.num_sgpr, max(33, max_num_sgpr)
+; GCN:	.set usage_external_recurse.private_seg_size, 0
+; GCN:	.set usage_external_recurse.uses_vcc, 1
+; GCN:	.set usage_external_recurse.uses_flat_scratch, 1
+; GCN:	.set usage_external_recurse.has_dyn_sized_stack, 1
+; GCN:	.set usage_external_recurse.has_recursion, 1
+; GCN:	.set usage_external_recurse.has_indirect_call, 1
+; GCN: NumSgprs: usage_external_recurse.num_sgpr+6
+; GCN: NumVgprs: usage_external_recurse.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @usage_external_recurse() #0 {
+  call void @external_recurse()
+  ret void
+}
+
+; GCN-LABEL: {{^}}direct_recursion_use_stack:
+; GCN: .set direct_recursion_use_stack.num_vgpr, 41
+; GCN: .set direct_recursion_use_stack.num_agpr, 0
+; GCN: .set direct_recursion_use_stack.num_sgpr, 36
+; GCN: .set direct_recursion_use_stack.private_seg_size, 2064
+; GCN: .set direct_recursion_use_stack.uses_vcc, 1
+; GCN: .set direct_recursion_use_stack.uses_flat_scratch, 0
+; GCN: .set direct_recursion_use_stack.has_dyn_sized_stack, 0
+; GCN: .set direct_recursion_use_stack.has_recursion, 1
+; GCN: .set direct_recursion_use_stack.has_indirect_call, 0
+; GCN: NumSgprs: 40
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define void @direct_recursion_use_stack(i32 %val) #2 {
+  %alloca = alloca [512 x i32], align 4, addrspace(5)
+  call void asm sideeffect "; use $0", "v"(ptr addrspace(5) %alloca) #0
+  %cmp = icmp eq i32 %val, 0
+  br i1 %cmp, label %ret, label %call
+
+call:
+  %val.sub1 = sub i32 %val, 1
+  call void @direct_recursion_use_stack(i32 %val.sub1)
+  br label %ret
+
+ret:
+  ret void
+}
+
+; GCN-LABEL: {{^}}usage_direct_recursion:
+; GCN:  .set usage_direct_recursion.num_vgpr, max(32, direct_recursion_use_stack.num_vgpr)
+; GCN:  .set usage_direct_recursion.num_agpr, max(0, direct_recursion_use_stack.num_agpr)
+; GCN:  .set usage_direct_recursion.num_sgpr, max(33, direct_recursion_use_stack.num_sgpr)
+; GCN:  .set usage_direct_recursion.private_seg_size, 0+(max(direct_recursion_use_stack.private_seg_size))
+; GCN:  .set usage_direct_recursion.uses_vcc, or(1, direct_recursion_use_stack.uses_vcc)
+; GCN:  .set usage_direct_recursion.uses_flat_scratch, or(1, direct_recursion_use_stack.uses_flat_scratch)
+; GCN:  .set usage_direct_recursion.has_dyn_sized_stack, or(0, direct_recursion_use_stack.has_dyn_sized_stack)
+; GCN:  .set usage_direct_recursion.has_recursion, or(1, direct_recursion_use_stack.has_recursion)
+; GCN:  .set usage_direct_recursion.has_indirect_call, or(0, direct_recursion_use_stack.has_indirect_call)
+; GCN: NumSgprs: 42
+; GCN: NumVgprs: 41
+; GCN: ScratchSize: 2064
+define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
+  call void @direct_recursion_use_stack(i32 %n)
+  ret void
+}
+
+; Make sure there's no assert when a sgpr96 is used.
+; GCN-LABEL: {{^}}count_use_sgpr96_external_call
+; GCN:	.set count_use_sgpr96_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN:	.set count_use_sgpr96_external_call.num_agpr, max(0, max_num_agpr)
+; GCN:	.set count_use_sgpr96_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN:	.set count_use_sgpr96_external_call.private_seg_size, 0
+; GCN:	.set count_use_sgpr96_external_call.uses_vcc, 1
+; GCN:	.set count_use_sgpr96_external_call.uses_flat_scratch, 1
+; GCN:	.set count_use_sgpr96_external_call.has_dyn_sized_stack, 1
+; GCN:	.set count_use_sgpr96_external_call.has_recursion, 0
+; GCN:	.set count_use_sgpr96_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr96_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr96_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr96_external_call()  {
+entry:
+  tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> <i32 10, i32 11, i32 12>) #1
+  call void @external()
+  ret void
+}
+
+; Make sure there's no assert when a sgpr160 is used.
+; GCN-LABEL: {{^}}count_use_sgpr160_external_call
+; GCN:	.set count_use_sgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN:	.set count_use_sgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN:	.set count_use_sgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN:	.set count_use_sgpr160_external_call.private_seg_size, 0
+; GCN:	.set count_use_sgpr160_external_call.uses_vcc, 1
+; GCN:	.set count_use_sgpr160_external_call.uses_flat_scratch, 1
+; GCN:	.set count_use_sgpr160_external_call.has_dyn_sized_stack, 1
+; GCN:	.set count_use_sgpr160_external_call.has_recursion, 0
+; GCN:	.set count_use_sgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_sgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_sgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_sgpr160_external_call()  {
+entry:
+  tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+  call void @external()
+  ret void
+}
+
+; Make sure there's no assert when a vgpr160 is used.
+; GCN-LABEL: {{^}}count_use_vgpr160_external_call
+; GCN:	.set count_use_vgpr160_external_call.num_vgpr, max(32, max_num_vgpr)
+; GCN:	.set count_use_vgpr160_external_call.num_agpr, max(0, max_num_agpr)
+; GCN:	.set count_use_vgpr160_external_call.num_sgpr, max(33, max_num_sgpr)
+; GCN:	.set count_use_vgpr160_external_call.private_seg_size, 0
+; GCN:	.set count_use_vgpr160_external_call.uses_vcc, 1
+; GCN:	.set count_use_vgpr160_external_call.uses_flat_scratch, 1
+; GCN:	.set count_use_vgpr160_external_call.has_dyn_sized_stack, 1
+; GCN:	.set count_use_vgpr160_external_call.has_recursion, 0
+; GCN:	.set count_use_vgpr160_external_call.has_indirect_call, 1
+; GCN: NumSgprs: count_use_vgpr160_external_call.num_sgpr+6
+; GCN: NumVgprs: count_use_vgpr160_external_call.num_vgpr
+; GCN: ScratchSize: 0
+define amdgpu_kernel void @count_use_vgpr160_external_call()  {
+entry:
+  tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> <i32 10, i32 11, i32 12, i32 13, i32 14>) #1
+  call void @external()
+  ret void
+}
+
+; Added at the end of the .s are the module-level maximums
+; GCN:	.set max_num_vgpr, 50
+; GCN:	.set max_num_agpr, 0
+; GCN:	.set max_num_sgpr, 80
+
+attributes #0 = { nounwind noinline norecurse }
+attributes #1 = { nounwind noinline norecurse }
+attributes #2 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
index 0f951e89d37c8a..4f300e2282426e 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll
@@ -36,8 +36,8 @@
 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 ; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -65,8 +65,8 @@ define amdgpu_kernel void @minimal_kernel_inputs() #0 {
 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 ; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -98,8 +98,8 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 {
 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0
 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 ; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 0
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 0
@@ -145,8 +145,8 @@ define amdgpu_kernel void @queue_ptr() #1 {
 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1
 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0
 ; GCN-NEXT: .amdhsa_wavefront_size32
-; GCN-NEXT: .amdhsa_uses_dynamic_stack 0
-; GCN-NEXT: .amdhsa_enable_private_segment 1
+; GCN-NEXT: .amdhsa_uses_dynamic_stack
+; GCN-NEXT: .amdhsa_enable_private_segment
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_z 1
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
index f20d720c3876ba..dce4162c246247 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm-reserved-regs.ll
@@ -41,7 +41,7 @@ entry:
 }
 
 ; FIXME: This should warn too
-; ERR-NOT: warning
+; ERR-NOT: warning: inline asm clobber list contains reserved registers
 define amdgpu_kernel void @def_exec(ptr addrspace(1) %ptr) {
 entry:
   %exec = call i64 asm sideeffect "; def $0", "={exec}"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index cddfb21a6fbdf4..6e07e2fc9da79a 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -3,6 +3,18 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
+define <2 x i64> @f1() #0 {
+; GFX11-LABEL: f1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  ret <2 x i64> zeroinitializer
+}
+
 define void @f0() {
 ; GFX11-LABEL: f0:
 ; GFX11:       ; %bb.0: ; %bb
@@ -36,18 +48,6 @@ bb:
   ret void
 }
 
-define <2 x i64> @f1() #0 {
-; GFX11-LABEL: f1:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  ret <2 x i64> zeroinitializer
-}
-
 ; FIXME: This generates "instid1(/* invalid instid value */)".
 define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
 ; GFX11-LABEL: f2:
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index b49931379b84a5..4575df1e0c6b95 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -105,13 +105,6 @@ define void @test_funcx2() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}wombat:
-define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
-bb:
-  call void @hoge() #0
-  ret void
-}
-
 ; Make sure we save/restore the return address around the call.
 ; Function Attrs: norecurse
 define internal void @hoge() #2 {
@@ -128,6 +121,13 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}wombat:
+define weak amdgpu_kernel void @wombat(ptr %arg, ptr %arg2) {
+bb:
+  call void @hoge() #0
+  ret void
+}
+
 declare dso_local void @eggs()
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da251..d9a5a49e75f0a5 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -5,12 +5,14 @@ declare void @llvm.trap() #0
 
 ; DOORBELL:      .amdhsa_kernel trap
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
-; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0
+; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size trap.private_seg_size
 ; DOORBELL-NEXT:     .amdhsa_kernarg_size 8
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 12
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
+; DOORBELL: .set trap.private_seg_size, 0
+
 define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) #0 {
   store volatile i32 1, ptr addrspace(1) %arg0
   call void @llvm.trap()
diff --git a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
index cdd6e88dd103b7..f3d9e9a727c251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel_code_t_recurse.ll
@@ -1,6 +1,8 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d < %s | FileCheck %s
 
 ; CHECK-LABEL: non_kernel_recursion:
+; CHECK: .set non_kernel_recursion.has_recursion, 1
+; CHECK: .set non_kernel_recursion.has_indirect_call, 0
 define void @non_kernel_recursion(i32 %val) #2 {
   %cmp = icmp eq i32 %val, 0
   br i1 %cmp, label %ret, label %call
@@ -16,8 +18,11 @@ ret:
 
 ; CHECK-LABEL: kernel_caller_recursion:
 ; CHECK: .amd_kernel_code_t
-; CHECK: is_dynamic_callstack = 1
+; CHECK: is_dynamic_callstack = kernel_caller_recursion.has_dyn_sized_stack|kernel_caller_recursion.has_recursion
 ; CHECK: .end_amd_kernel_code_t
+
+; CHECK: .set kernel_caller_recursion.has_recursion, or(1, non_kernel_recursion.has_recursion)
+; CHECK: .set kernel_caller_recursion.has_indirect_call, or(0, non_kernel_recursion.has_indirect_call)
 define amdgpu_kernel void @kernel_caller_recursion(i32 %n) #0 {
   call void @non_kernel_recursion(i32 %n)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 7698372b687797..64bc14d750573b 100644
--- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck --check-prefixes=GCN,CI,ALL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,VI,ALL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck --check-prefixes=GCN,GFX9,ALL %s
-; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
-; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GCNHSA,ALL %s
+; RUN: llc -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=BON,GCNHSA,ALL %s
+; RUN: llc -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=CAR,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GFX10,GCNHSA,ALL %s
+; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global,-architected-flat-scratch,-user-sgpr-init16-bug < %s | FileCheck --check-prefixes=GFX11,GCNHSA,ALL %s
 
 ; FIXME: align on alloca seems to be ignored for private_segment_alignment
 
@@ -24,7 +24,7 @@
 
 ; GCNHSA: .amdhsa_kernel large_alloca_compute_shader
 ; GCNHSA:         .amdhsa_group_segment_fixed_size 0
-; GCNHSA:         .amdhsa_private_segment_fixed_size 32772
+; GCNHSA:         .amdhsa_private_segment_fixed_size large_alloca_compute_shader.private_seg_size
 ; GCNHSA:         .amdhsa_user_sgpr_private_segment_buffer 1
 ; GCNHSA:         .amdhsa_user_sgpr_dispatch_ptr 1
 ; GCNHSA:         .amdhsa_user_sgpr_queue_ptr 1
@@ -32,14 +32,19 @@
 ; GCNHSA:         .amdhsa_user_sgpr_dispatch_id 1
 ; GCNHSA:         .amdhsa_user_sgpr_flat_scratch_init 1
 ; GCNHSA:         .amdhsa_user_sgpr_private_segment_size 0
-; GCNHSA:         .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCNHSA:         .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(large_alloca_compute_shader.private_seg_size*{{32|64}}, {{1024|256}}))/{{1024|256}})>0)||(large_alloca_compute_shader.has_dyn_sized_stack|large_alloca_compute_shader.has_recursion))|5020)&1
 ; GCNHSA:         .amdhsa_system_sgpr_workgroup_id_x 1
 ; GCNHSA:         .amdhsa_system_sgpr_workgroup_id_y 1
 ; GCNHSA:         .amdhsa_system_sgpr_workgroup_id_z 1
 ; GCNHSA:         .amdhsa_system_sgpr_workgroup_info 0
 ; GCNHSA:         .amdhsa_system_vgpr_workitem_id 2
-; GCNHSA:         .amdhsa_next_free_vgpr 3
-; GCNHSA:         .amdhsa_next_free_sgpr 18
+; GCNHSA:         .amdhsa_next_free_vgpr max(totalnumvgprs(large_alloca_compute_shader.num_agpr, large_alloca_compute_shader.num_vgpr), 1, 0)
+; BON:            .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; CAR:            .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX10:          .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 1))
+; GFX11:          .amdhsa_next_free_sgpr (max(large_alloca_compute_shader.num_sgpr+(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(large_alloca_compute_shader.uses_vcc, large_alloca_compute_shader.uses_flat_scratch, 0))
+; GCNHSA:         .amdhsa_reserve_vcc large_alloca_compute_shader.uses_vcc
+; GCNHSA:         .amdhsa_reserve_flat_scratch large_alloca_compute_shader.uses_flat_scratch
 ; GCNHSA:         .amdhsa_float_round_mode_32 0
 ; GCNHSA:         .amdhsa_float_round_mode_16_64 0
 ; GCNHSA:         .amdhsa_float_denorm_mode_32 3
@@ -55,6 +60,16 @@
 ; GCNHSA:         .amdhsa_exception_int_div_zero 0
 ; GCNHSA: .end_amdhsa_kernel
 
+; GCNHSA: .set large_alloca_compute_shader.num_vgpr, 3
+; GCNHSA: .set large_alloca_compute_shader.num_agpr, 0
+; GCNHSA: .set large_alloca_compute_shader.num_sgpr, 18
+; GCNHSA: .set large_alloca_compute_shader.private_seg_size, 32772
+; GCNHSA: .set large_alloca_compute_shader.uses_vcc
+; GCNHSA: .set large_alloca_compute_shader.uses_flat_scratch, 0
+; GCNHSA: .set large_alloca_compute_shader.has_dyn_sized_stack, 0
+; GCNHSA: .set large_alloca_compute_shader.has_recursion, 0
+; GCNHSA: .set large_alloca_compute_shader.has_indirect_call, 0
+
 ; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5)
 ; ALL: ; ScratchSize: 32772
 define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 1b1ea52520c0bf..cf331e119126ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -146,12 +146,9 @@
 ; GCN-O0-NEXT:        Lazy Machine Block Frequency Analysis
 ; GCN-O0-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O0-NEXT:        Stack Frame Layout Analysis
-; GCN-O0-NEXT:    Function register usage analysis
-; GCN-O0-NEXT:    FunctionPass Manager
-; GCN-O0-NEXT:      Lazy Machine Block Frequency Analysis
-; GCN-O0-NEXT:      Machine Optimization Remark Emitter
-; GCN-O0-NEXT:      AMDGPU Assembly Printer
-; GCN-O0-NEXT:      Free MachineFunction
+; GCN-O0-NEXT:        Function register usage analysis
+; GCN-O0-NEXT:        AMDGPU Assembly Printer
+; GCN-O0-NEXT:        Free MachineFunction
 
 ; GCN-O1:Target Library Information
 ; GCN-O1-NEXT:Target Pass Configuration
@@ -421,12 +418,9 @@
 ; GCN-O1-NEXT:        Lazy Machine Block Frequency Analysis
 ; GCN-O1-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O1-NEXT:        Stack Frame Layout Analysis
-; GCN-O1-NEXT:    Function register usage analysis
-; GCN-O1-NEXT:    FunctionPass Manager
-; GCN-O1-NEXT:      Lazy Machine Block Frequency Analysis
-; GCN-O1-NEXT:      Machine Optimization Remark Emitter
-; GCN-O1-NEXT:      AMDGPU Assembly Printer
-; GCN-O1-NEXT:      Free MachineFunction
+; GCN-O1-NEXT:        Function register usage analysis
+; GCN-O1-NEXT:        AMDGPU Assembly Printer
+; GCN-O1-NEXT:        Free MachineFunction
 
 ; GCN-O1-OPTS:Target Library Information
 ; GCN-O1-OPTS-NEXT:Target Pass Configuration
@@ -724,12 +718,9 @@
 ; GCN-O1-OPTS-NEXT:        Lazy Machine Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O1-OPTS-NEXT:        Stack Frame Layout Analysis
-; GCN-O1-OPTS-NEXT:    Function register usage analysis
-; GCN-O1-OPTS-NEXT:    FunctionPass Manager
-; GCN-O1-OPTS-NEXT:      Lazy Machine Block Frequency Analysis
-; GCN-O1-OPTS-NEXT:      Machine Optimization Remark Emitter
-; GCN-O1-OPTS-NEXT:      AMDGPU Assembly Printer
-; GCN-O1-OPTS-NEXT:      Free MachineFunction
+; GCN-O1-OPTS-NEXT:        Function register usage analysis
+; GCN-O1-OPTS-NEXT:        AMDGPU Assembly Printer
+; GCN-O1-OPTS-NEXT:        Free MachineFunction
 
 ; GCN-O2:Target Library Information
 ; GCN-O2-NEXT:Target Pass Configuration
@@ -1033,12 +1024,9 @@
 ; GCN-O2-NEXT:        Lazy Machine Block Frequency Analysis
 ; GCN-O2-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O2-NEXT:        Stack Frame Layout Analysis
-; GCN-O2-NEXT:    Function register usage analysis
-; GCN-O2-NEXT:    FunctionPass Manager
-; GCN-O2-NEXT:      Lazy Machine Block Frequency Analysis
-; GCN-O2-NEXT:      Machine Optimization Remark Emitter
-; GCN-O2-NEXT:      AMDGPU Assembly Printer
-; GCN-O2-NEXT:      Free MachineFunction
+; GCN-O2-NEXT:        Function register usage analysis
+; GCN-O2-NEXT:        AMDGPU Assembly Printer
+; GCN-O2-NEXT:        Free MachineFunction
 
 ; GCN-O3:Target Library Information
 ; GCN-O3-NEXT:Target Pass Configuration
@@ -1354,12 +1342,9 @@
 ; GCN-O3-NEXT:        Lazy Machine Block Frequency Analysis
 ; GCN-O3-NEXT:        Machine Optimization Remark Emitter
 ; GCN-O3-NEXT:        Stack Frame Layout Analysis
-; GCN-O3-NEXT:    Function register usage analysis
-; GCN-O3-NEXT:    FunctionPass Manager
-; GCN-O3-NEXT:      Lazy Machine Block Frequency Analysis
-; GCN-O3-NEXT:      Machine Optimization Remark Emitter
-; GCN-O3-NEXT:      AMDGPU Assembly Printer
-; GCN-O3-NEXT:      Free MachineFunction
+; GCN-O3-NEXT:        Function register usage analysis
+; GCN-O3-NEXT:        AMDGPU Assembly Printer
+; GCN-O3-NEXT:        Free MachineFunction
 
 define void @empty() {
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index eaee8ec73fe411..778060d3c5fb3d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 132{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_x.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_x.has_dyn_sized_stack|test_workitem_id_x.has_recursion))|132{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_x:
 ; MESA3D: enable_vgpr_workitem_id = 0
@@ -29,7 +29,7 @@ define amdgpu_kernel void @test_workitem_id_x(ptr addrspace(1) %out) #1 {
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 2180{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_y.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_y.has_dyn_sized_stack|test_workitem_id_y.has_recursion))|2180{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_y:
 ; MESA3D: enable_vgpr_workitem_id = 1
@@ -47,7 +47,7 @@ define amdgpu_kernel void @test_workitem_id_y(ptr addrspace(1) %out) #1 {
 
 ; MESA: .section .AMDGPU.config
 ; MESA: .long 47180
-; MESA-NEXT: .long 4228{{$}}
+; MESA-NEXT: .long ((((alignto(test_workitem_id_z.private_seg_size*64, 1024))/1024)>0)||(test_workitem_id_z.has_dyn_sized_stack|test_workitem_id_z.has_recursion))|4228{{$}}
 
 ; ALL-LABEL: {{^}}test_workitem_id_z:
 ; MESA3D: enable_vgpr_workitem_id = 2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
index 34dcdaf29677e4..b508ffff8050a8 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -9,6 +9,19 @@
 @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
 
+; GCN-LABEL: {{^}}f0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
+; GCN:     ds_write_b8 [[NULL]], [[TREE]]
+define void @f0() {
+; OPT-LABEL: @f0() {
+; OPT-NEXT:    store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
+; OPT-NEXT:    ret void
+;
+  store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
+  ret void
+}
+
 ; GCN-LABEL: {{^}}k0:
 ; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
@@ -29,16 +42,3 @@ define amdgpu_kernel void @k0() {
   call void @f0()
   ret void
 }
-
-; GCN-LABEL: {{^}}f0:
-; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
-; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
-; GCN:     ds_write_b8 [[NULL]], [[TREE]]
-define void @f0() {
-; OPT-LABEL: @f0() {
-; OPT-NEXT:    store i8 3, ptr addrspace(3) @llvm.amdgcn.module.lds, align 1
-; OPT-NEXT:    ret void
-;
-  store i8 3, ptr addrspace(3) @lds.size.1.align.1, align 1
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
index 7f0f473c11bd59..f3cf2a5ca8ff62 100644
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@@ -1,14 +1,15 @@
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,ALL %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,ALL %s
 
 ; SPI_TMPRING_SIZE.WAVESIZE = 5
 ; GFX10: .long 165608
-; GFX10-NEXT: .long 20480
+; GFX10-NEXT: .long (((alignto(scratch_ps.private_seg_size*32, 1024))/1024)&8191)<<12
 
 ; SPI_TMPRING_SIZE.WAVESIZE = 17
 ; GFX11: .long 165608
-; GFX11-NEXT: .long 69632
+; 11XFG-TXEN: .long 69632
+; GFX11-NEXT:.long (((alignto(scratch_ps.private_seg_size*32, 256))/256)&32767)<<12
 
 ; GCN-LABEL: {{^}}scratch_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
@@ -23,3 +24,5 @@ entry:
   store volatile i32 2, ptr addrspace(5) %ptr
   ret void
 }
+
+; ALL: .set scratch_ps.private_seg_size, 132
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 27b71dd471a839..aa16937d7d897d 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -24,6 +24,55 @@ store i32 0, ptr addrspace(3) @used_by_kernel
 }
 ; CHECK: ; LDSByteSize: 4 bytes
 
+define void @nonkernel() {
+; GFX9-LABEL: nonkernel:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, v0
+; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: nonkernel:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; G_GFX9-LABEL: nonkernel:
+; G_GFX9:       ; %bb.0:
+; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
+; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX9-NEXT:    ds_write_b32 v3, v2
+; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; G_GFX10-LABEL: nonkernel:
+; G_GFX10:       ; %bb.0:
+; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
+; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT:    ds_write_b32 v3, v2
+; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
+  store i32 0, ptr addrspace(3) @used_by_both
+  store double 0.0, ptr addrspace(3) @used_by_function
+  ret void
+}
+
 ; Needs to allocate both variables, store to used_by_both is at sizeof(double)
 define amdgpu_kernel void @withcall() {
 ; GFX9-LABEL: withcall:
@@ -171,55 +220,5 @@ define amdgpu_kernel void @nocall_false_sharing() {
 }
 ; CHECK: ; LDSByteSize: 4 bytes
 
-
-define void @nonkernel() {
-; GFX9-LABEL: nonkernel:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    v_mov_b32_e32 v1, v0
-; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
-; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: nonkernel:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v1, v0
-; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
-; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX9-LABEL: nonkernel:
-; G_GFX9:       ; %bb.0:
-; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
-; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX9-NEXT:    ds_write_b32 v3, v2
-; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; G_GFX10-LABEL: nonkernel:
-; G_GFX10:       ; %bb.0:
-; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
-; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; G_GFX10-NEXT:    ds_write_b32 v3, v2
-; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
-; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
-  store i32 0, ptr addrspace(3) @used_by_both
-  store double 0.0, ptr addrspace(3) @used_by_function
-  ret void
-}
-
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index b84686139d0e2c..3c100cf7a38527 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -110,13 +110,14 @@ bb.2:
   store volatile i32 0, ptr addrspace(1) undef
   ret void
 }
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 4112
 ; DEFAULTSIZE: ; ScratchSize: 4112
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 16
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((41|((kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 16
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.has_dyn_sized_stack, 1
 ; DEFAULTSIZE-V5: ; ScratchSize: 16
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align4.private_seg_size, 1040
 ; ASSUME1024: ; ScratchSize: 1040
 
 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(ptr addrspace(1) %out, i32 %arg.cond, i32 %in) {
@@ -205,13 +206,16 @@ bb.1:
   ret void
 }
 
-; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
+; DEFAULTSIZE: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 4160
 ; DEFAULTSIZE: ; ScratchSize: 4160
-; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size 64
-; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack 1
+; DEFAULTSIZE-V5: .amdhsa_private_segment_fixed_size kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size
+; DEFAULTSIZE-V5: .amdhsa_uses_dynamic_stack ((59|((kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack|kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_recursion)<<11))&2048)>>11
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 64
+; DEFAULTSIZE-V5: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.has_dyn_sized_stack, 1
 ; DEFAULTSIZE-V5: ; ScratchSize: 64
 
-; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
+; ASSUME1024: .set kernel_non_entry_block_static_alloca_uniformly_reached_align64.private_seg_size, 1088
 ; ASSUME1024: ; ScratchSize: 1088
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/recursion.ll b/llvm/test/CodeGen/AMDGPU/recursion.ll
index d58477c194ea62..c0d228e1254e64 100644
--- a/llvm/test/CodeGen/AMDGPU/recursion.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursion.ll
@@ -3,7 +3,11 @@
 ; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=V5 %s
 
 ; CHECK-LABEL: {{^}}recursive:
+; CHECK: .set recursive.private_seg_size, 16+(max(16384))
 ; CHECK: ScratchSize: 16
+
+; V5-LABEL: {{^}}recursive:
+; V5: .set recursive.has_recursion, 1
 define void @recursive() {
   call void @recursive()
   store volatile i32 0, ptr addrspace(1) undef
@@ -11,18 +15,22 @@ define void @recursive() {
 }
 
 ; CHECK-LABEL: {{^}}tail_recursive:
+; CHECK: .set tail_recursive.private_seg_size, 0
 ; CHECK: ScratchSize: 0
 define void @tail_recursive() {
   tail call void @tail_recursive()
   ret void
 }
 
+; CHECK: .set calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
 define void @calls_tail_recursive() norecurse {
   tail call void @tail_recursive()
   ret void
 }
 
 ; CHECK-LABEL: {{^}}tail_recursive_with_stack:
+; CHECK: .set tail_recursive_with_stack.private_seg_size, 8
+; CHECK: .set tail_recursive_with_stack.has_recursion, 1
 define void @tail_recursive_with_stack() {
   %alloca = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %alloca
@@ -33,11 +41,11 @@ define void @tail_recursive_with_stack() {
 ; For an arbitrary recursive call, report a large number for unknown stack
 ; usage for code object v4 and older
 ; CHECK-LABEL: {{^}}calls_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16400{{$}}
+; CHECK: .set calls_recursive.private_seg_size, 0+(max(16384, recursive.private_seg_size))
 ;
 ; V5-LABEL: {{^}}calls_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set calls_recursive.private_seg_size, 0+(max(recursive.private_seg_size))
+; V5: .set calls_recursive.has_dyn_sized_stack, or(0, recursive.has_dyn_sized_stack)
 define amdgpu_kernel void @calls_recursive() {
   call void @recursive()
   ret void
@@ -46,7 +54,7 @@ define amdgpu_kernel void @calls_recursive() {
 ; Make sure we do not report a huge stack size for tail recursive
 ; functions
 ; CHECK-LABEL: {{^}}kernel_indirectly_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 0{{$}}
+; CHECK: .set kernel_indirectly_calls_tail_recursive.private_seg_size, 0+(max(calls_tail_recursive.private_seg_size))
 define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
   call void @calls_tail_recursive()
   ret void
@@ -57,22 +65,22 @@ define amdgpu_kernel void @kernel_indirectly_calls_tail_recursive() {
 ; in the kernel.
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(16384, tail_recursive.private_seg_size))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive:
-; V5: .amdhsa_private_segment_fixed_size 0{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive.private_seg_size, 0+(max(tail_recursive.private_seg_size))
+; V5: .set kernel_calls_tail_recursive.has_recursion, or(1, tail_recursive.has_recursion)
 define amdgpu_kernel void @kernel_calls_tail_recursive() {
   call void @tail_recursive()
   ret void
 }
 
 ; CHECK-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; CHECK: .amdhsa_private_segment_fixed_size 16384{{$}}
+; CHECK: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(16384, tail_recursive_with_stack.private_seg_size))
 ;
 ; V5-LABEL: {{^}}kernel_calls_tail_recursive_with_stack:
-; V5: .amdhsa_private_segment_fixed_size 8{{$}}
-; V5: .amdhsa_uses_dynamic_stack 1
+; V5: .set kernel_calls_tail_recursive_with_stack.private_seg_size, 0+(max(tail_recursive_with_stack.private_seg_size))
+; V5: .set kernel_calls_tail_recursive_with_stack.has_dyn_sized_stack, or(0, tail_recursive_with_stack.has_dyn_sized_stack)
 define amdgpu_kernel void @kernel_calls_tail_recursive_with_stack() {
   call void @tail_recursive_with_stack()
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
index 002de8bb4eb510..87ec51fb44ac45 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-optimization-remarks.ll
@@ -2,12 +2,12 @@
 ; RUN: FileCheck -check-prefix=REMARK %s < %t
 
 ; STDERR: remark: foo.cl:27:0: Function Name: test_kernel
-; STDERR-NEXT: remark: foo.cl:27:0:     SGPRs: 28
-; STDERR-NEXT: remark: foo.cl:27:0:     VGPRs: 9
-; STDERR-NEXT: remark: foo.cl:27:0:     AGPRs: 43
-; STDERR-NEXT: remark: foo.cl:27:0:     ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:27:0:     SGPRs: test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:27:0:     VGPRs: test_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:27:0:     AGPRs: test_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:27:0:     ScratchSize [bytes/lane]: test_kernel.private_seg_size
 ; STDERR-NEXT: remark: foo.cl:27:0:     Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:27:0:     Occupancy [waves/SIMD]: 5
+; STDERR-NEXT: remark: foo.cl:27:0:     Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:27:0:     SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:27:0:     VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:27:0:     LDS Size [bytes/block]: 512
@@ -19,7 +19,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          'Function Name: '
-; REMARK-NEXT:   - FunctionName:      test_kernel
+; REMARK-NEXT:   - FunctionName:    test_kernel
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -28,7 +28,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    SGPRs: '
-; REMARK-NEXT:   - NumSGPR:         '28'
+; REMARK-NEXT:   - NumSGPR:         'test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1))'
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -37,7 +37,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    VGPRs: '
-; REMARK-NEXT:   - NumVGPR:         '9'
+; REMARK-NEXT:   - NumVGPR:         test_kernel.num_vgpr
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -46,7 +46,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    AGPRs: '
-; REMARK-NEXT:   - NumAGPR:         '43'
+; REMARK-NEXT:   - NumAGPR:         test_kernel.num_agpr
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -55,17 +55,17 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    ScratchSize [bytes/lane]: '
-; REMARK-NEXT:   - ScratchSize:     '0'
-; REMARK-NEXT: ..
+; REMARK-NEXT:   - ScratchSize:     test_kernel.private_seg_size
+; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
 ; REMARK-NEXT: Name:            DynamicStack
 ; REMARK-NEXT: DebugLoc:        { File: foo.cl, Line: 27, Column: 0 }
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
-; REMARK-NEXT:   - String: ' Dynamic Stack:
-; REMARK-NEXT:   - DynamicStack: 'False'
-; REMARK-NEXT: ..
+; REMARK-NEXT:   - String:          '    Dynamic Stack: '
+; REMARK-NEXT:   - DynamicStack:    'False'
+; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
 ; REMARK-NEXT: Name:            Occupancy
@@ -73,7 +73,7 @@
 ; REMARK-NEXT: Function:        test_kernel
 ; REMARK-NEXT: Args:
 ; REMARK-NEXT:   - String:          '    Occupancy [waves/SIMD]: '
-; REMARK-NEXT:   - Occupancy:       '5'
+; REMARK-NEXT:   - Occupancy:       'occupancy(10, 4, 256, 8, 8, max(test_kernel.num_sgpr+(extrasgprs(test_kernel.uses_vcc, test_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_kernel.num_agpr, test_kernel.num_vgpr), 1, 0))'
 ; REMARK-NEXT: ...
 ; REMARK-NEXT: --- !Analysis
 ; REMARK-NEXT: Pass:            kernel-resource-usage
@@ -122,12 +122,12 @@ define void @test_func() !dbg !6 {
 }
 
 ; STDERR: remark: foo.cl:8:0: Function Name: empty_kernel
-; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs: 4
-; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: 0
-; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: 0
+; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs: empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs: empty_kernel.num_vgpr
+; STDERR-NEXT: remark: foo.cl:8:0:     AGPRs: empty_kernel.num_agpr
+; STDERR-NEXT: remark: foo.cl:8:0:     ScratchSize [bytes/lane]: empty_kernel.private_seg_size
 ; STDERR-NEXT: remark: foo.cl:8:0:     Dynamic Stack: False
-; STDERR-NEXT: remark: foo.cl:8:0:     Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:8:0:     Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(empty_kernel.num_sgpr+(extrasgprs(empty_kernel.uses_vcc, empty_kernel.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(empty_kernel.num_agpr, empty_kernel.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:8:0:     SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:8:0:     LDS Size [bytes/block]: 0
@@ -141,12 +141,12 @@ define void @empty_func() !dbg !8 {
 }
 
 ; STDERR: remark: foo.cl:64:0: Function Name: test_indirect_call
-; STDERR-NEXT: remark: foo.cl:64:0:     SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:64:0:     VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:64:0:     AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:64:0:     ScratchSize [bytes/lane]: 0
-; STDERR-NEXT: remark: foo.cl:64:0:     Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:64:0:     Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:64:0:     SGPRs: test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:64:0:     VGPRs: test_indirect_call.num_vgpr
+; STDERR-NEXT: remark: foo.cl:64:0:     AGPRs: test_indirect_call.num_agpr
+; STDERR-NEXT: remark: foo.cl:64:0:     ScratchSize [bytes/lane]: test_indirect_call.private_seg_size
+; STDERR-NEXT: remark: foo.cl:64:0:     Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:64:0:     Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_call.num_sgpr+(extrasgprs(test_indirect_call.uses_vcc, test_indirect_call.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_call.num_agpr, test_indirect_call.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:64:0:     SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0:     VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:64:0:     LDS Size [bytes/block]: 0
@@ -159,12 +159,12 @@ define amdgpu_kernel void @test_indirect_call() !dbg !9 {
 }
 
 ; STDERR: remark: foo.cl:74:0: Function Name: test_indirect_w_static_stack
-; STDERR-NEXT: remark: foo.cl:74:0:     SGPRs: 39
-; STDERR-NEXT: remark: foo.cl:74:0:     VGPRs: 32
-; STDERR-NEXT: remark: foo.cl:74:0:     AGPRs: 10
-; STDERR-NEXT: remark: foo.cl:74:0:     ScratchSize [bytes/lane]: 144
-; STDERR-NEXT: remark: foo.cl:74:0:     Dynamic Stack: True
-; STDERR-NEXT: remark: foo.cl:74:0:     Occupancy [waves/SIMD]: 8
+; STDERR-NEXT: remark: foo.cl:74:0:     SGPRs: test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1))
+; STDERR-NEXT: remark: foo.cl:74:0:     VGPRs: test_indirect_w_static_stack.num_vgpr
+; STDERR-NEXT: remark: foo.cl:74:0:     AGPRs: test_indirect_w_static_stack.num_agpr
+; STDERR-NEXT: remark: foo.cl:74:0:     ScratchSize [bytes/lane]: test_indirect_w_static_stack.private_seg_size
+; STDERR-NEXT: remark: foo.cl:74:0:     Dynamic Stack: False
+; STDERR-NEXT: remark: foo.cl:74:0:     Occupancy [waves/SIMD]: occupancy(10, 4, 256, 8, 8, max(test_indirect_w_static_stack.num_sgpr+(extrasgprs(test_indirect_w_static_stack.uses_vcc, test_indirect_w_static_stack.uses_flat_scratch, 1)), 1, 0), max(totalnumvgprs(test_indirect_w_static_stack.num_agpr, test_indirect_w_static_stack.num_vgpr), 1, 0))
 ; STDERR-NEXT: remark: foo.cl:74:0:     SGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0:     VGPRs Spill: 0
 ; STDERR-NEXT: remark: foo.cl:74:0:     LDS Size [bytes/block]: 0
diff --git a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
index bba59ba4d80302..5d5aad76afd095 100644
--- a/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/resource-usage-dead-function.ll
@@ -1,6 +1,6 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefix=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - | FileCheck -check-prefixes=GCN-V5,ALL %s
 
 ; Make sure there's no assertion when trying to report the resource
 ; usage for a function which becomes dead during codegen.
@@ -21,9 +21,10 @@ define internal fastcc void @unreachable() {
 ; GCN-NOT: s_swappc_b64
 ; GCN: s_endpgm
 
-; GCN: .amdhsa_private_segment_fixed_size 0
-; GCN-NOT: .amdhsa_uses_dynamic_stack 0
-; GCN-V5: .amdhsa_uses_dynamic_stack 0
+; GCN-NOT: .amdhsa_uses_dynamic_stack
+; GCN-V5: .amdhsa_uses_dynamic_stack
+; ALL: .set entry.private_seg_size, 0
+; ALL: .set entry.has_dyn_sized_stack, 0
 define amdgpu_kernel void @entry() {
 bb0:
   br i1 false, label %bb1, label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
index 6ddf0986755f95..38d202eb4308f6 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll
@@ -19,7 +19,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:    .p2align 6
 ; VI-NEXT:    .amdhsa_kernel max_alignment_128
 ; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
-; VI-NEXT:     .amdhsa_private_segment_fixed_size 256
+; VI-NEXT:     .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
 ; VI-NEXT:     .amdhsa_kernarg_size 56
 ; VI-NEXT:     .amdhsa_user_sgpr_count 14
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -29,16 +29,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
-; VI-NEXT:     .amdhsa_next_free_sgpr 18
-; VI-NEXT:     .amdhsa_reserve_vcc 0
-; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
+; VI-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; VI-NEXT:     .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 0))
+; VI-NEXT:     .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; VI-NEXT:     .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
 ; VI-NEXT:     .amdhsa_float_round_mode_32 0
 ; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
 ; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
@@ -54,6 +54,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; VI-NEXT:     .amdhsa_exception_int_div_zero 0
 ; VI-NEXT:    .end_amdhsa_kernel
 ; VI-NEXT:    .text
+; VI:         .set max_alignment_128.num_vgpr, 1
+; VI-NEXT:    .set max_alignment_128.num_agpr, 0
+; VI-NEXT:    .set max_alignment_128.num_sgpr, 18
+; VI-NEXT:    .set max_alignment_128.private_seg_size, 256
+; VI-NEXT:    .set max_alignment_128.uses_vcc, 0
+; VI-NEXT:    .set max_alignment_128.uses_flat_scratch, 0
+; VI-NEXT:    .set max_alignment_128.has_dyn_sized_stack, 0
+; VI-NEXT:    .set max_alignment_128.has_recursion, 0
+; VI-NEXT:    .set max_alignment_128.has_indirect_call, 0
 ;
 ; GFX9-LABEL: max_alignment_128:
 ; GFX9:       ; %bb.0:
@@ -70,7 +79,7 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:    .p2align 6
 ; GFX9-NEXT:    .amdhsa_kernel max_alignment_128
 ; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 256
+; GFX9-NEXT:     .amdhsa_private_segment_fixed_size max_alignment_128.private_seg_size
 ; GFX9-NEXT:     .amdhsa_kernarg_size 56
 ; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -80,16 +89,16 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(max_alignment_128.private_seg_size*64, 1024))/1024)>0)||(max_alignment_128.has_dyn_sized_stack|max_alignment_128.has_recursion))|5020)&1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
-; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
-; GFX9-NEXT:     .amdhsa_reserve_vcc 0
-; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(max_alignment_128.num_agpr, max_alignment_128.num_vgpr), 1, 0)
+; GFX9-NEXT:     .amdhsa_next_free_sgpr (max(max_alignment_128.num_sgpr+(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(max_alignment_128.uses_vcc, max_alignment_128.uses_flat_scratch, 1))
+; GFX9-NEXT:     .amdhsa_reserve_vcc max_alignment_128.uses_vcc
+; GFX9-NEXT:     .amdhsa_reserve_flat_scratch max_alignment_128.uses_flat_scratch
 ; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
 ; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
 ; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
@@ -107,6 +116,15 @@ define amdgpu_kernel void @max_alignment_128() #0 {
 ; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
 ; GFX9-NEXT:    .end_amdhsa_kernel
 ; GFX9-NEXT:    .text
+; GFX9:         .set max_alignment_128.num_vgpr, 1
+; GFX9-NEXT:    .set max_alignment_128.num_agpr, 0
+; GFX9-NEXT:    .set max_alignment_128.num_sgpr, 18
+; GFX9-NEXT:    .set max_alignment_128.private_seg_size, 256
+; GFX9-NEXT:    .set max_alignment_128.uses_vcc, 0
+; GFX9-NEXT:    .set max_alignment_128.uses_flat_scratch, 0
+; GFX9-NEXT:    .set max_alignment_128.has_dyn_sized_stack, 0
+; GFX9-NEXT:    .set max_alignment_128.has_recursion, 0
+; GFX9-NEXT:    .set max_alignment_128.has_indirect_call, 0
   %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
   store volatile i8 3, ptr addrspace(5) %clutter
   %alloca.align = alloca i32, align 128, addrspace(5)
@@ -130,7 +148,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:    .p2align 6
 ; VI-NEXT:    .amdhsa_kernel stackrealign_attr
 ; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
-; VI-NEXT:     .amdhsa_private_segment_fixed_size 12
+; VI-NEXT:     .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
 ; VI-NEXT:     .amdhsa_kernarg_size 56
 ; VI-NEXT:     .amdhsa_user_sgpr_count 14
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -140,16 +158,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
-; VI-NEXT:     .amdhsa_next_free_sgpr 18
-; VI-NEXT:     .amdhsa_reserve_vcc 0
-; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
+; VI-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; VI-NEXT:     .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 0))
+; VI-NEXT:     .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; VI-NEXT:     .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
 ; VI-NEXT:     .amdhsa_float_round_mode_32 0
 ; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
 ; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
@@ -165,6 +183,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; VI-NEXT:     .amdhsa_exception_int_div_zero 0
 ; VI-NEXT:    .end_amdhsa_kernel
 ; VI-NEXT:    .text
+; VI:         .set stackrealign_attr.num_vgpr, 1
+; VI-NEXT:    .set stackrealign_attr.num_agpr, 0
+; VI-NEXT:    .set stackrealign_attr.num_sgpr, 18
+; VI-NEXT:    .set stackrealign_attr.private_seg_size, 12
+; VI-NEXT:    .set stackrealign_attr.uses_vcc, 0
+; VI-NEXT:    .set stackrealign_attr.uses_flat_scratch, 0
+; VI-NEXT:    .set stackrealign_attr.has_dyn_sized_stack, 0
+; VI-NEXT:    .set stackrealign_attr.has_recursion, 0
+; VI-NEXT:    .set stackrealign_attr.has_indirect_call, 0
 ;
 ; GFX9-LABEL: stackrealign_attr:
 ; GFX9:       ; %bb.0:
@@ -181,7 +208,7 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:    .p2align 6
 ; GFX9-NEXT:    .amdhsa_kernel stackrealign_attr
 ; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 12
+; GFX9-NEXT:     .amdhsa_private_segment_fixed_size stackrealign_attr.private_seg_size
 ; GFX9-NEXT:     .amdhsa_kernarg_size 56
 ; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -191,16 +218,16 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(stackrealign_attr.private_seg_size*64, 1024))/1024)>0)||(stackrealign_attr.has_dyn_sized_stack|stackrealign_attr.has_recursion))|5020)&1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
-; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
-; GFX9-NEXT:     .amdhsa_reserve_vcc 0
-; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(stackrealign_attr.num_agpr, stackrealign_attr.num_vgpr), 1, 0)
+; GFX9-NEXT:     .amdhsa_next_free_sgpr (max(stackrealign_attr.num_sgpr+(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(stackrealign_attr.uses_vcc, stackrealign_attr.uses_flat_scratch, 1))
+; GFX9-NEXT:     .amdhsa_reserve_vcc stackrealign_attr.uses_vcc
+; GFX9-NEXT:     .amdhsa_reserve_flat_scratch stackrealign_attr.uses_flat_scratch
 ; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
 ; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
 ; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
@@ -218,6 +245,15 @@ define amdgpu_kernel void @stackrealign_attr() #1 {
 ; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
 ; GFX9-NEXT:    .end_amdhsa_kernel
 ; GFX9-NEXT:    .text
+; GFX9:         .set stackrealign_attr.num_vgpr, 1
+; GFX9-NEXT:    .set stackrealign_attr.num_agpr, 0
+; GFX9-NEXT:    .set stackrealign_attr.num_sgpr, 18
+; GFX9-NEXT:    .set stackrealign_attr.private_seg_size, 12
+; GFX9-NEXT:    .set stackrealign_attr.uses_vcc, 0
+; GFX9-NEXT:    .set stackrealign_attr.uses_flat_scratch, 0
+; GFX9-NEXT:    .set stackrealign_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT:    .set stackrealign_attr.has_recursion, 0
+; GFX9-NEXT:    .set stackrealign_attr.has_indirect_call, 0
   %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
   store volatile i8 3, ptr addrspace(5) %clutter
   %alloca.align = alloca i32, align 4, addrspace(5)
@@ -241,7 +277,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:    .p2align 6
 ; VI-NEXT:    .amdhsa_kernel alignstack_attr
 ; VI-NEXT:     .amdhsa_group_segment_fixed_size 0
-; VI-NEXT:     .amdhsa_private_segment_fixed_size 128
+; VI-NEXT:     .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
 ; VI-NEXT:     .amdhsa_kernarg_size 56
 ; VI-NEXT:     .amdhsa_user_sgpr_count 14
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -251,16 +287,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; VI-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; VI-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; VI-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; VI-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; VI-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; VI-NEXT:     .amdhsa_next_free_vgpr 1
-; VI-NEXT:     .amdhsa_next_free_sgpr 18
-; VI-NEXT:     .amdhsa_reserve_vcc 0
-; VI-NEXT:     .amdhsa_reserve_flat_scratch 0
+; VI-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; VI-NEXT:     .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 0))
+; VI-NEXT:     .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; VI-NEXT:     .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
 ; VI-NEXT:     .amdhsa_float_round_mode_32 0
 ; VI-NEXT:     .amdhsa_float_round_mode_16_64 0
 ; VI-NEXT:     .amdhsa_float_denorm_mode_32 3
@@ -276,6 +312,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; VI-NEXT:     .amdhsa_exception_int_div_zero 0
 ; VI-NEXT:    .end_amdhsa_kernel
 ; VI-NEXT:    .text
+; VI:         .set alignstack_attr.num_vgpr, 1
+; VI-NEXT:    .set alignstack_attr.num_agpr, 0
+; VI-NEXT:    .set alignstack_attr.num_sgpr, 18
+; VI-NEXT:    .set alignstack_attr.private_seg_size, 128
+; VI-NEXT:    .set alignstack_attr.uses_vcc, 0
+; VI-NEXT:    .set alignstack_attr.uses_flat_scratch, 0
+; VI-NEXT:    .set alignstack_attr.has_dyn_sized_stack, 0
+; VI-NEXT:    .set alignstack_attr.has_recursion, 0
+; VI-NEXT:    .set alignstack_attr.has_indirect_call, 0
 ;
 ; GFX9-LABEL: alignstack_attr:
 ; GFX9:       ; %bb.0:
@@ -292,7 +337,7 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:    .p2align 6
 ; GFX9-NEXT:    .amdhsa_kernel alignstack_attr
 ; GFX9-NEXT:     .amdhsa_group_segment_fixed_size 0
-; GFX9-NEXT:     .amdhsa_private_segment_fixed_size 128
+; GFX9-NEXT:     .amdhsa_private_segment_fixed_size alignstack_attr.private_seg_size
 ; GFX9-NEXT:     .amdhsa_kernarg_size 56
 ; GFX9-NEXT:     .amdhsa_user_sgpr_count 14
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
@@ -302,16 +347,16 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_user_sgpr_dispatch_id 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_flat_scratch_init 1
 ; GFX9-NEXT:     .amdhsa_user_sgpr_private_segment_size 0
-; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GFX9-NEXT:     .amdhsa_system_sgpr_private_segment_wavefront_offset (((((alignto(alignstack_attr.private_seg_size*64, 1024))/1024)>0)||(alignstack_attr.has_dyn_sized_stack|alignstack_attr.has_recursion))|5020)&1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_x 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_y 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_id_z 1
 ; GFX9-NEXT:     .amdhsa_system_sgpr_workgroup_info 0
 ; GFX9-NEXT:     .amdhsa_system_vgpr_workitem_id 2
-; GFX9-NEXT:     .amdhsa_next_free_vgpr 1
-; GFX9-NEXT:     .amdhsa_next_free_sgpr 18
-; GFX9-NEXT:     .amdhsa_reserve_vcc 0
-; GFX9-NEXT:     .amdhsa_reserve_flat_scratch 0
+; GFX9-NEXT:     .amdhsa_next_free_vgpr max(totalnumvgprs(alignstack_attr.num_agpr, alignstack_attr.num_vgpr), 1, 0)
+; GFX9-NEXT:     .amdhsa_next_free_sgpr (max(alignstack_attr.num_sgpr+(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(alignstack_attr.uses_vcc, alignstack_attr.uses_flat_scratch, 1))
+; GFX9-NEXT:     .amdhsa_reserve_vcc alignstack_attr.uses_vcc
+; GFX9-NEXT:     .amdhsa_reserve_flat_scratch alignstack_attr.uses_flat_scratch
 ; GFX9-NEXT:     .amdhsa_reserve_xnack_mask 1
 ; GFX9-NEXT:     .amdhsa_float_round_mode_32 0
 ; GFX9-NEXT:     .amdhsa_float_round_mode_16_64 0
@@ -329,6 +374,15 @@ define amdgpu_kernel void @alignstack_attr() #2 {
 ; GFX9-NEXT:     .amdhsa_exception_int_div_zero 0
 ; GFX9-NEXT:    .end_amdhsa_kernel
 ; GFX9-NEXT:    .text
+; GFX9:         .set alignstack_attr.num_vgpr, 1
+; GFX9-NEXT:    .set alignstack_attr.num_agpr, 0
+; GFX9-NEXT:    .set alignstack_attr.num_sgpr, 18
+; GFX9-NEXT:    .set alignstack_attr.private_seg_size, 128
+; GFX9-NEXT:    .set alignstack_attr.uses_vcc, 0
+; GFX9-NEXT:    .set alignstack_attr.uses_flat_scratch, 0
+; GFX9-NEXT:    .set alignstack_attr.has_dyn_sized_stack, 0
+; GFX9-NEXT:    .set alignstack_attr.has_recursion, 0
+; GFX9-NEXT:    .set alignstack_attr.has_indirect_call, 0
   %clutter = alloca i8, addrspace(5) ; Force non-zero offset for next alloca
   store volatile i8 3, ptr addrspace(5) %clutter
   %alloca.align = alloca i32, align 4, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0d..432d8e0e856dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -6,8 +6,9 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
 ; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
 
 ; Verify that an extra SGPR block is reserved with XNACK "any" tid setting.
 ; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c9959..b6b30bc591e2b9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -6,8 +6,9 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 0))
 ; ASM: .amdhsa_reserve_xnack_mask 0
+; ASM: .set kern.num_sgpr, 5
 
 ; Verify that an extra SGPR block is not reserved with XNACK "off" tid setting.
 ; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a9..0aa5f2a0919761 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -6,8 +6,9 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_next_free_sgpr (max(kern.num_sgpr+(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1)), 1, 0))-(extrasgprs(kern.uses_vcc, kern.uses_flat_scratch, 1))
 ; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM: .set kern.num_sgpr, 5
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
 ; OBJ: Contents of section .rodata:
diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll
index 9bab3e6fcf8c45..c2845bf1035640 100644
--- a/llvm/test/CodeGen/AMDGPU/trap.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap.ll
@@ -5,23 +5,32 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t1 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs %s -o %t2 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t1
+; RUN: FileCheck -check-prefix=GCN %s < %t2
 
 ; enable trap handler feature
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t3 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs %s -o %t4 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t3
+; RUN: FileCheck -check-prefix=GCN -check-prefix=TRAP-BIT %s < %t4
 
 ; disable trap handler feature
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t5 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs %s -o %t6 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t5
+; RUN: FileCheck -check-prefix=GCN -check-prefix=NO-TRAP-BIT %s < %t6
+
+; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs %s -o %t7 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs %s -o %t8 2>&1 | FileCheck -check-prefix=GCN-WARNING %s
+; RUN: FileCheck -check-prefix=GCN %s < %t7
+; RUN: FileCheck -check-prefix=GCN %s < %t8
 
-; RUN: llc -global-isel=0 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
 
 ; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (ptr addrspace(1)): debugtrap handler not supported
 
@@ -31,11 +40,11 @@ declare void @llvm.debugtrap() #1
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP:  .long   47180
-; MESA-TRAP-NEXT: .long   5080
+; MESA-TRAP-NEXT: .long	((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5080
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP:  .long   47180
-; NOMESA-TRAP-NEXT: .long   5016
+; NOMESA-TRAP-NEXT: .long	((((alignto(hsa_trap.private_seg_size*64, 1024))/1024)>0)||(hsa_trap.has_dyn_sized_stack|hsa_trap.has_recursion))|5016
 
 ; GCN-LABEL: {{^}}hsa_trap:
 ; HSA-TRAP: s_mov_b64 s[0:1], s[6:7]
@@ -59,11 +68,11 @@ define amdgpu_kernel void @hsa_trap(ptr addrspace(1) nocapture readonly %arg0) {
 
 ; MESA-TRAP: .section .AMDGPU.config
 ; MESA-TRAP:  .long   47180
-; MESA-TRAP-NEXT: .long   5080
+; MESA-TRAP-NEXT: .long	((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5080
 
 ; NOMESA-TRAP: .section .AMDGPU.config
 ; NOMESA-TRAP:  .long   47180
-; NOMESA-TRAP-NEXT: .long   5016
+; NOMESA-TRAP-NEXT: .long	((((alignto(hsa_debugtrap.private_seg_size*64, 1024))/1024)>0)||(hsa_debugtrap.has_dyn_sized_stack|hsa_debugtrap.has_recursion))|5016
 
 ; GCN-LABEL: {{^}}hsa_debugtrap:
 ; HSA-TRAP: s_trap 3

>From 5dff9e2352a3c00164c6fe6e9d9fdc27a0c5fb15 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Mon, 12 Aug 2024 15:56:46 +0100
Subject: [PATCH 2/2] Formatting

---
 llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index e41f302c3d56ce..6a91ad06de5d12 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -65,11 +65,11 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
       const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize,
       const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch,
       const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion,
-      const MCSymbol *HasIndirectCall){};
+      const MCSymbol *HasIndirectCall) {};
 
   virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR,
                                       const MCSymbol *MaxAGPR,
-                                      const MCSymbol *MaxSGPR){};
+                                      const MCSymbol *MaxSGPR) {};
 
   /// \returns True on success, false on failure.
   virtual bool EmitISAVersion() { return true; }



More information about the cfe-commits mailing list