[llvm] MCExpr-ify SIProgramInfo (PR #88257)

Wed Apr 10 04:49:14 PDT 2024

https://github.com/JanekvO created https://github.com/llvm/llvm-project/pull/88257

All members in SIProgramInfo that are affected by variables provided by AMDGPUResourceUsageAnalysis need to become MCExpr for when AMDGPUResourceUsageAnalysis' resource info propagation is done in the MC layer, through MCExprs. Additionally, some operations done on said resource info to create/populate some of the SIProgramInfo members now have become custom MCExpr operations (e.g., occupancy compute, totalnumvgpr, etc.).

>From 5da69c76c040df8c1ecaa7038d4e2ebebb93698b Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 9 Apr 2024 22:08:47 +0100
Subject: [PATCH] MCExpr-ify SIProgramInfo struct with accompanying population
 function.

---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   | 396 ++++++++++++------
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h     |   2 +
 .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp      |  22 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |   8 +-
 .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp      | 221 ++++++++++
 .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h |  35 +-
 llvm/lib/Target/AMDGPU/SIProgramInfo.cpp      | 207 +++++++--
 llvm/lib/Target/AMDGPU/SIProgramInfo.h        |  45 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   6 +
 llvm/test/MC/AMDGPU/alignto_mcexpr.s          |  15 +
 llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s       |  41 ++
 llvm/test/MC/AMDGPU/occupancy_mcexpr.s        |  61 +++
 llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s     |  25 ++
 14 files changed, 905 insertions(+), 191 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/alignto_mcexpr.s
 create mode 100644 llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
 create mode 100644 llvm/test/MC/AMDGPU/occupancy_mcexpr.s
 create mode 100644 llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 052b231d62a3eba..b410f0c13e1b49d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -22,6 +22,7 @@
 #include "AMDKernelCodeT.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
+#include "MCTargetDesc/AMDGPUMCExpr.h"
 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "R600AsmPrinter.h"
@@ -134,6 +135,12 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
     getTargetStreamer()->getPALMetadata()->readFromIR(M);
 }
 
+uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value) {
+  int64_t Val;
+  Value->evaluateAsAbsolute(Val);
+  return Val;
+}
+
 void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
   // Init target streamer if it has not yet happened
   if (!IsTargetStreamerInitialized)
@@ -237,12 +244,14 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
   getNameWithPrefix(KernelName, &MF->getFunction());
   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
       STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
-      CurrentProgramInfo.NumVGPRsForWavesPerEU,
-      CurrentProgramInfo.NumSGPRsForWavesPerEU -
+      getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU),
+      getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU) -
           IsaInfo::getNumExtraSGPRs(
-              &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+              &STM, getMCExprValue(CurrentProgramInfo.VCCUsed),
+              getMCExprValue(CurrentProgramInfo.FlatUsed),
               getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
-      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
+      getMCExprValue(CurrentProgramInfo.VCCUsed),
+      getMCExprValue(CurrentProgramInfo.FlatUsed));
 
   Streamer.popSection();
 }
@@ -422,7 +431,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
   }
 
-  if (CurrentProgramInfo.DynamicCallStack &&
+  if (getMCExprValue(CurrentProgramInfo.DynamicCallStack) &&
       CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
     KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
 
@@ -439,29 +448,22 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
 
   MCKernelDescriptor KernelDescriptor;
 
-  assert(isUInt<32>(PI.ScratchSize));
-  assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
-  assert(isUInt<32>(PI.getComputePGMRSrc2()));
-
   KernelDescriptor.group_segment_fixed_size =
       MCConstantExpr::create(PI.LDSSize, Ctx);
-  KernelDescriptor.private_segment_fixed_size =
-      MCConstantExpr::create(PI.ScratchSize, Ctx);
+  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
 
   Align MaxKernArgAlign;
   KernelDescriptor.kernarg_size = MCConstantExpr::create(
       STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
 
-  KernelDescriptor.compute_pgm_rsrc1 =
-      MCConstantExpr::create(PI.getComputePGMRSrc1(STM), Ctx);
-  KernelDescriptor.compute_pgm_rsrc2 =
-      MCConstantExpr::create(PI.getComputePGMRSrc2(), Ctx);
+  KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
+  KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
   KernelDescriptor.kernel_code_properties =
       MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
 
-  assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
-  KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
-      STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
+  assert(STM.hasGFX90AInsts() ||
+         getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
+  KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
 
   KernelDescriptor.kernarg_preload = MCConstantExpr::create(
       AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
@@ -477,7 +479,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     initTargetStreamer(*MF.getFunction().getParent());
 
   ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
-  CurrentProgramInfo = SIProgramInfo();
+  CurrentProgramInfo.reset(MF);
 
   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
 
@@ -550,11 +552,13 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
     OutStreamer->emitRawComment(" Kernel info:", false);
     emitCommonFunctionComments(
-        CurrentProgramInfo.NumArchVGPR,
-        STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
+        getMCExprValue(CurrentProgramInfo.NumArchVGPR),
+        STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR)
                           : std::optional<uint32_t>(),
-        CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
-        CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
+        getMCExprValue(CurrentProgramInfo.NumVGPR),
+        getMCExprValue(CurrentProgramInfo.NumSGPR),
+        getMCExprValue(CurrentProgramInfo.ScratchSize), getFunctionCodeSize(MF),
+        MFI);
 
     OutStreamer->emitRawComment(
       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -565,32 +569,38 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       " bytes/workgroup (compile time only)", false);
 
     OutStreamer->emitRawComment(
-      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+        " SGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks)),
+        false);
     OutStreamer->emitRawComment(
-      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
+        " VGPRBlocks: " + Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks)),
+        false);
 
     OutStreamer->emitRawComment(
-      " NumSGPRsForWavesPerEU: " +
-      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+        " NumSGPRsForWavesPerEU: " +
+            Twine(getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU)),
+        false);
     OutStreamer->emitRawComment(
-      " NumVGPRsForWavesPerEU: " +
-      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
+        " NumVGPRsForWavesPerEU: " +
+            Twine(getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU)),
+        false);
 
     if (STM.hasGFX90AInsts())
       OutStreamer->emitRawComment(
-        " AccumOffset: " +
-        Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
+          " AccumOffset: " +
+              Twine((getMCExprValue(CurrentProgramInfo.AccumOffset) + 1) * 4),
+          false);
 
     OutStreamer->emitRawComment(
-      " Occupancy: " +
-      Twine(CurrentProgramInfo.Occupancy), false);
+        " Occupancy: " + Twine(getMCExprValue(CurrentProgramInfo.Occupancy)),
+        false);
 
     OutStreamer->emitRawComment(
       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
 
-    OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
-                                    Twine(CurrentProgramInfo.ScratchEnable),
-                                false);
+    OutStreamer->emitRawComment(
+        " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+            Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable)),
+        false);
     OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                     Twine(CurrentProgramInfo.UserSGPR),
                                 false);
@@ -611,18 +621,20 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                                 false);
 
     assert(STM.hasGFX90AInsts() ||
-           CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
+           getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A) == 0);
     if (STM.hasGFX90AInsts()) {
       OutStreamer->emitRawComment(
-        " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
-        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
-                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
-                               false);
+          " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
+              Twine((AMDHSA_BITS_GET(
+                  getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
+          false);
       OutStreamer->emitRawComment(
-        " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
-        Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
-                               amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
-                               false);
+          " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
+              Twine((AMDHSA_BITS_GET(
+                  getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A),
+                  amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
+          false);
     }
   }
 
@@ -702,23 +714,40 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &Info =
       ResourceUsage->getResourceInfo(&MF.getFunction());
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+  MCContext &Ctx = MF.getContext();
+
+  auto CreateExpr = [&Ctx](int64_t Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
 
-  ProgInfo.NumArchVGPR = Info.NumVGPR;
-  ProgInfo.NumAccVGPR = Info.NumAGPR;
-  ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
-  ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
+  auto TryGetMCExprValue = [&Ctx](const MCExpr *Value, uint64_t &Res) -> bool {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val)) {
+      Res = Val;
+      return true;
+    } else
+      return false;
+  };
+
+  ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
+  ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
+  ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
+  ProgInfo.AccumOffset =
+      CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
   ProgInfo.TgSplit = STM.isTgSplitEnabled();
-  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
-  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
-  ProgInfo.VCCUsed = Info.UsesVCC;
-  ProgInfo.FlatUsed = Info.UsesFlatScratch;
-  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+  ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
+  ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
+  ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
+  ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
+  ProgInfo.DynamicCallStack =
+      CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
 
   const uint64_t MaxScratchPerWorkitem =
       STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
-  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
-    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
-                                          ProgInfo.ScratchSize,
+  uint64_t ScratchSize;
+  if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
+      ScratchSize > MaxScratchPerWorkitem) {
+    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
                                           MaxScratchPerWorkitem, DS_Error);
     MF.getFunction().getContext().diagnose(DiagStackSize);
   }
@@ -728,27 +757,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // The calculations related to SGPR/VGPR blocks are
   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
   // unified.
-  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
-      &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
-      getTargetStreamer()->getTargetID()->isXnackOnOrAny());
+  const MCExpr *ExtraSGPRs = AMDGPUVariadicMCExpr::createExtraSGPRs(
+      ProgInfo.VCCUsed, ProgInfo.FlatUsed, getIsaVersion(STM.getCPU()).Major,
+      STM.getFeatureBits().test(AMDGPU::FeatureArchitectedFlatScratch),
+      getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
 
   // Check the addressable register limit before we add ExtraSGPRs.
   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       !STM.hasSGPRInitBug()) {
     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
-    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+    uint64_t NumSgpr;
+    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
       // This can happen due to a compiler bug or when using inline asm.
       LLVMContext &Ctx = MF.getFunction().getContext();
       DiagnosticInfoResourceLimit Diag(
-          MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
+          MF.getFunction(), "addressable scalar registers", NumSgpr,
           MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
       Ctx.diagnose(Diag);
-      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
+      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
     }
   }
 
   // Account for extra SGPRs and VGPRs reserved for debugger use.
-  ProgInfo.NumSGPR += ExtraSGPRs;
+  ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
 
   const Function &F = MF.getFunction();
 
@@ -819,40 +851,50 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         }
       }
     }
-    ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
-    ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
-    ProgInfo.NumVGPR =
-        Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
+    ProgInfo.NumSGPR = AMDGPUVariadicMCExpr::createMax(
+        {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
+
+    ProgInfo.NumArchVGPR = AMDGPUVariadicMCExpr::createMax(
+        {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
+
+    ProgInfo.NumVGPR = AMDGPUVariadicMCExpr::createTotalNumVGPR(
+        STM.hasGFX90AInsts(), ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
   }
 
   // Adjust number of registers used to meet default/requested minimum/maximum
   // number of waves per execution unit request.
-  ProgInfo.NumSGPRsForWavesPerEU = std::max(
-    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
-  ProgInfo.NumVGPRsForWavesPerEU = std::max(
-    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+  ProgInfo.NumSGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+      {ProgInfo.NumSGPR, CreateExpr(1ul),
+       CreateExpr(STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()))},
+      Ctx);
+  ProgInfo.NumVGPRsForWavesPerEU = AMDGPUVariadicMCExpr::createMax(
+      {ProgInfo.NumVGPR, CreateExpr(1ul),
+       CreateExpr(STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()))},
+      Ctx);
 
   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
       STM.hasSGPRInitBug()) {
     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
-    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+    uint64_t NumSgpr;
+    if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
+        NumSgpr > MaxAddressableNumSGPRs) {
       // This can happen due to a compiler bug or when using inline asm to use
       // the registers which are usually reserved for vcc etc.
       LLVMContext &Ctx = MF.getFunction().getContext();
       DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
-                                       ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
+                                       NumSgpr, MaxAddressableNumSGPRs,
                                        DS_Error, DK_ResourceLimit);
       Ctx.diagnose(Diag);
-      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
-      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
+      ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
+      ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
     }
   }
 
   if (STM.hasSGPRInitBug()) {
     ProgInfo.NumSGPR =
-        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
     ProgInfo.NumSGPRsForWavesPerEU =
-        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+        CreateExpr(AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG);
   }
 
   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
@@ -871,11 +913,26 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         STM.getAddressableLocalMemorySize(), DS_Error);
     Ctx.diagnose(Diag);
   }
+  // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
+  // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
+  auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
+                                             unsigned Granule) {
+    const MCExpr *OneConst = CreateExpr(1ul);
+    const MCExpr *GranuleConst = CreateExpr(Granule);
+    const MCExpr *MaxNumGPR =
+        AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
+    const MCExpr *AlignToGPR =
+        AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
+    const MCExpr *DivGPR =
+        MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
+    const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
+    return SubGPR;
+  };
 
-  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
-      &STM, ProgInfo.NumSGPRsForWavesPerEU);
-  ProgInfo.VGPRBlocks =
-      IsaInfo::getEncodedNumVGPRBlocks(&STM, ProgInfo.NumVGPRsForWavesPerEU);
+  ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
+                                        IsaInfo::getSGPREncodingGranule(&STM));
+  ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
+                                        IsaInfo::getVGPREncodingGranule(&STM));
 
   const SIModeRegisterDefaults Mode = MFI->getMode();
 
@@ -904,14 +961,23 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 
+  // The MCExpr equivalent of divideCeil.
+  auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
+    const MCExpr *Ceil =
+        AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
+    return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
+  };
+
   // Scratch is allocated in 64-dword or 256-dword blocks.
   unsigned ScratchAlignShift =
       STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
   // We need to program the hardware with the amount of scratch memory that
   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
   // scratch memory used per thread.
-  ProgInfo.ScratchBlocks = divideCeil(
-      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
+  ProgInfo.ScratchBlocks = DivideCeil(
+      MCBinaryExpr::createMul(ProgInfo.ScratchSize,
+                              CreateExpr(STM.getWavefrontSize()), Ctx),
+      CreateExpr(1ULL << ScratchAlignShift));
 
   if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
     ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@@ -930,8 +996,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   // anything to disable it if we know the stack isn't used here. We may still
   // have emitted code reading it to initialize scratch, but if that's unused
   // reading garbage should be OK.
-  ProgInfo.ScratchEnable =
-      ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
+  ProgInfo.ScratchEnable = MCBinaryExpr::createLOr(
+      MCBinaryExpr::createGT(ProgInfo.ScratchBlocks,
+                             MCConstantExpr::create(0, Ctx), Ctx),
+      ProgInfo.DynamicCallStack, Ctx);
+
   ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
   // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
   ProgInfo.TrapHandlerEnable =
@@ -947,26 +1016,41 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.EXCPEnable = 0;
 
   if (STM.hasGFX90AInsts()) {
-    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
-                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
-                    ProgInfo.AccumOffset);
-    AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
-                    amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
-                    ProgInfo.TgSplit);
+    // return ((Dst & ~Mask) | (Value << Shift))
+    auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
+                          uint32_t Shift) {
+      auto Shft = MCConstantExpr::create(Shift, Ctx);
+      auto Msk = MCConstantExpr::create(Mask, Ctx);
+      Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
+      Dst = MCBinaryExpr::createOr(
+          Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
+      return Dst;
+    };
+
+    ProgInfo.ComputePGMRSrc3GFX90A =
+        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
+    ProgInfo.ComputePGMRSrc3GFX90A =
+        SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
+                amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
   }
 
-  ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
-                                            ProgInfo.NumSGPRsForWavesPerEU,
-                                            ProgInfo.NumVGPRsForWavesPerEU);
+  ProgInfo.Occupancy = AMDGPUVariadicMCExpr::createOccupancy(
+      STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
+      ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
+
   const auto [MinWEU, MaxWEU] =
       AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
-  if (ProgInfo.Occupancy < MinWEU) {
+  uint64_t Occupancy;
+  if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
     DiagnosticInfoOptimizationFailure Diag(
         F, F.getSubprogram(),
         "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
         "'" +
             F.getName() + "': desired occupancy was " + Twine(MinWEU) +
-            ", final occupancy is " + Twine(ProgInfo.Occupancy));
+            ", final occupancy is " + Twine(Occupancy));
     F.getContext().diagnose(Diag);
   }
 }
@@ -989,36 +1073,76 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
+  MCContext &Ctx = MF.getContext();
+
+  // (((Value) & Mask) << Shift)
+  auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
+    const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
+    const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
+    return MCBinaryExpr::createShl(MCBinaryExpr::createAnd(Value, msk, Ctx),
+                                   shft, Ctx);
+  };
+
+  auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
+    int64_t Val;
+    if (Value->evaluateAsAbsolute(Val))
+      OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
+    else
+      OutStreamer->emitValue(Value, Size);
+  };
 
   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
     OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
 
-    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
+    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
+                       /*Size=*/4);
 
     OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
-    OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
+    EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
 
     OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
-    OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX12
-            ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
-        : STM.getGeneration() == AMDGPUSubtarget::GFX11
-            ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
-            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+    // appropriate generation.
+    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
+                         /*Size=*/4);
+    else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x7FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    else
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x1FFF, /*Shift=*/12),
+                         /*Size=*/4);
 
     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     // 0" comment but I don't see a corresponding field in the register spec.
   } else {
     OutStreamer->emitInt32(RsrcReg);
-    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
-                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
+
+    const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
+        SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
+        SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
+        MF.getContext());
+    EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
     OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
-    OutStreamer->emitInt32(
-        STM.getGeneration() >= AMDGPUSubtarget::GFX12
-            ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
-        : STM.getGeneration() == AMDGPUSubtarget::GFX11
-            ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
-            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
+
+    // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
+    // appropriate generation.
+    if (STM.getGeneration() >= AMDGPUSubtarget::GFX12)
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x3FFFF, /*Shift=*/12),
+                         /*Size=*/4);
+    else if (STM.getGeneration() == AMDGPUSubtarget::GFX11)
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x7FFF, /*Shift=*/12),
+                         /*Size=*/4);
+    else
+      EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
+                                 /*Mask=*/0x1FFF, /*Shift=*/12),
+                         /*Size=*/4);
   }
 
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
@@ -1072,31 +1196,35 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
   auto MD = getTargetStreamer()->getPALMetadata();
 
   MD->setEntryPoint(CC, MF.getFunction().getName());
-  MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+  MD->setNumUsedVgprs(CC,
+                      getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
 
   // Only set AGPRs for supported devices
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
   if (STM.hasMAIInsts()) {
-    MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
+    MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR));
   }
 
-  MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+  MD->setNumUsedSgprs(CC,
+                      getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
   if (MD->getPALMajorVersion() < 3) {
     MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
     if (AMDGPU::isCompute(CC)) {
       MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
     } else {
-      if (CurrentProgramInfo.ScratchBlocks > 0)
+      if (getMCExprValue(CurrentProgramInfo.ScratchBlocks) > 0)
         MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
     }
   } else {
     MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
-    MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+    MD->setHwStage(CC, ".scratch_en",
+                   (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable));
     EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
   }
 
   // ScratchSize is in bytes, 16 aligned.
-  MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
+  MD->setScratchSize(
+      CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize), 16));
   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
                                 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
@@ -1158,8 +1286,10 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
 
   // Set optional info
   MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
-  MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
-  MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+  MD->setFunctionNumUsedVgprs(
+      FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU));
+  MD->setFunctionNumUsedSgprs(
+      FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU));
 }
 
 // This is supposed to be log2(Size)
@@ -1193,7 +1323,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
       (CurrentProgramInfo.getComputePGMRSrc2() << 32);
   Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
 
-  if (CurrentProgramInfo.DynamicCallStack)
+  if (getMCExprValue(CurrentProgramInfo.DynamicCallStack))
     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
 
   AMD_HSA_BITS_SET(Out.code_properties,
@@ -1229,9 +1359,10 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
 
   Align MaxKernArgAlign;
   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
-  Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
-  Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
-  Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+  Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR);
+  Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR);
+  Out.workitem_private_segment_byte_size =
+      getMCExprValue(CurrentProgramInfo.ScratchSize);
   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
 
   // kernarg_segment_alignment is specified as log of the alignment.
@@ -1324,17 +1455,20 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
   // printing multiple diagnostic location and diag opts.
   EmitResourceUsageRemark("FunctionName", "Function Name",
                           MF.getFunction().getName());
-  EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
-  EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
+  EmitResourceUsageRemark("NumSGPR", "SGPRs",
+                          getMCExprValue(CurrentProgramInfo.NumSGPR));
+  EmitResourceUsageRemark("NumVGPR", "VGPRs",
+                          getMCExprValue(CurrentProgramInfo.NumArchVGPR));
   if (hasMAIInsts)
-    EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
+    EmitResourceUsageRemark("NumAGPR", "AGPRs",
+                            getMCExprValue(CurrentProgramInfo.NumAccVGPR));
   EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
-                          CurrentProgramInfo.ScratchSize);
+                          getMCExprValue(CurrentProgramInfo.ScratchSize));
   StringRef DynamicStackStr =
-      CurrentProgramInfo.DynamicCallStack ? "True" : "False";
+      getMCExprValue(CurrentProgramInfo.DynamicCallStack) ? "True" : "False";
   EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
   EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
-                          CurrentProgramInfo.Occupancy);
+                          getMCExprValue(CurrentProgramInfo.Occupancy));
   EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
                           CurrentProgramInfo.SGPRSpill);
   EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index b8b2718d293e69d..3d155905c4afeba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,8 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
 
   void initTargetStreamer(Module &M);
 
+  static uint64_t getMCExprValue(const MCExpr *Value);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 9e288ab50e17017..a402e6fc68b4911 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -19,6 +19,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIProgramInfo.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
 using namespace llvm;
 
 static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
@@ -462,6 +463,12 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   const Function &F = MF.getFunction();
 
+  auto getMCExprValue = [](const MCExpr *Value) {
+    int64_t Val;
+    Value->evaluateAsAbsolute(Val);
+    return static_cast<uint64_t>(Val);
+  };
+
   auto Kern = HSAMetadataDoc->getMapNode();
 
   Align MaxKernArgAlign;
@@ -470,10 +477,10 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
   Kern[".group_segment_fixed_size"] =
       Kern.getDocument()->getNode(ProgramInfo.LDSSize);
   Kern[".private_segment_fixed_size"] =
-      Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+      Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.ScratchSize));
   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
-    Kern[".uses_dynamic_stack"] =
-        Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
+    Kern[".uses_dynamic_stack"] = Kern.getDocument()->getNode(
+        static_cast<bool>(getMCExprValue(ProgramInfo.DynamicCallStack)));
 
   if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
     Kern[".workgroup_processor_mode"] =
@@ -484,12 +491,15 @@ MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
       Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
   Kern[".wavefront_size"] =
       Kern.getDocument()->getNode(STM.getWavefrontSize());
-  Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
-  Kern[".vgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumVGPR);
+  Kern[".sgpr_count"] =
+      Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumSGPR));
+  Kern[".vgpr_count"] =
+      Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumVGPR));
 
   // Only add AGPR count to metadata for supported devices
   if (STM.hasMAIInsts()) {
-    Kern[".agpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumAccVGPR);
+    Kern[".agpr_count"] =
+        Kern.getDocument()->getNode(getMCExprValue(ProgramInfo.NumAccVGPR));
   }
 
   Kern[".max_flat_workgroup_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 38667235211471f..08b4a86994cab58 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDKernelCodeT.h"
+#include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCExpr.h"
 #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -8406,12 +8407,17 @@ bool AMDGPUAsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     AGVK VK = StringSwitch<AGVK>(TokenId)
                   .Case("max", AGVK::AGVK_Max)
                   .Case("or", AGVK::AGVK_Or)
+                  .Case("extrasgprs", AGVK::AGVK_ExtraSGPRs)
+                  .Case("totalnumvgprs", AGVK::AGVK_TotalNumVGPRs)
+                  .Case("totalnumvgprs90a", AGVK::AGVK_TotalNumVGPRs90A)
+                  .Case("alignto", AGVK::AGVK_AlignTo)
+                  .Case("occupancy", AGVK::AGVK_Occupancy)
                   .Default(AGVK::AGVK_None);
 
     if (VK != AGVK::AGVK_None && peekToken().is(AsmToken::LParen)) {
       SmallVector<const MCExpr *, 4> Exprs;
       uint64_t CommaCount = 0;
-      lex(); // Eat 'max'/'or'
+      lex(); // Eat Arg ('or', 'max', 'occupancy', etc.)
       lex(); // Eat '('
       while (true) {
         if (trySkipToken(AsmToken::RParen)) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index 4578c33d92dce1a..25813eb30aefd57 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,6 +7,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMCExpr.h"
+#include "GCNSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -16,6 +19,7 @@
 #include <optional>
 
 using namespace llvm;
+using namespace llvm::AMDGPU;
 
 AMDGPUVariadicMCExpr::AMDGPUVariadicMCExpr(VariadicKind Kind,
                                            ArrayRef<const MCExpr *> Args,
@@ -61,6 +65,21 @@ void AMDGPUVariadicMCExpr::printImpl(raw_ostream &OS,
   case AGVK_Max:
     OS << "max(";
     break;
+  case AGVK_ExtraSGPRs:
+    OS << "extrasgprs(";
+    break;
+  case AGVK_TotalNumVGPRs:
+    OS << "totalnumvgprs(";
+    break;
+  case AGVK_TotalNumVGPRs90A:
+    OS << "totalnumvgprs90a(";
+    break;
+  case AGVK_AlignTo:
+    OS << "alignto(";
+    break;
+  case AGVK_Occupancy:
+    OS << "occupancy(";
+    break;
   }
   for (auto It = Args.begin(); It != Args.end(); ++It) {
     (*It)->print(OS, MAI, /*InParens=*/false);
@@ -86,6 +105,158 @@ bool AMDGPUVariadicMCExpr::evaluateAsRelocatableImpl(
     MCValue &Res, const MCAsmLayout *Layout, const MCFixup *Fixup) const {
   std::optional<int64_t> Total;
 
+  auto TryGetMCExprValue = [&](const MCExpr *Arg, uint64_t &ConstantValue) {
+    MCValue MCVal;
+    if (!Arg->evaluateAsRelocatable(MCVal, Layout, Fixup) ||
+        !MCVal.isAbsolute())
+      return false;
+
+    ConstantValue = MCVal.getConstant();
+    return true;
+  };
+
+  if (Kind == AGVK_ExtraSGPRs) {
+    assert(Args.size() == 5 &&
+           "AMDGPUVariadic Argument count incorrect for ExtraSGPRs");
+    uint64_t VCCUsed, FlatScrUsed, MajorVersion, XNACKUsed,
+        hasArchitectedFlatScr, ExtraSGPRs = 0;
+
+    bool Success = true;
+    Success &= TryGetMCExprValue(Args[0], MajorVersion);
+    Success &= TryGetMCExprValue(Args[3], XNACKUsed);
+    Success &= TryGetMCExprValue(Args[4], hasArchitectedFlatScr);
+
+    assert(Success &&
+           "Arguments 1, 4, and 5 for ExtraSGPRs should be known constants");
+    if (!Success || !TryGetMCExprValue(Args[1], VCCUsed) ||
+        !TryGetMCExprValue(Args[2], FlatScrUsed))
+      return false;
+
+    if (VCCUsed)
+      ExtraSGPRs = 2;
+    if (MajorVersion >= 10) {
+      Res = MCValue::get(ExtraSGPRs);
+      return true;
+    }
+    if (MajorVersion < 8) {
+      if (FlatScrUsed)
+        ExtraSGPRs = 4;
+    } else {
+      if (XNACKUsed)
+        ExtraSGPRs = 4;
+      if (FlatScrUsed || hasArchitectedFlatScr)
+        ExtraSGPRs = 6;
+    }
+
+    Res = MCValue::get(ExtraSGPRs);
+    return true;
+  }
+
+  if (Kind == AGVK_AlignTo) {
+    assert(Args.size() == 2 &&
+           "AMDGPUVariadic Argument count incorrect for AlignTo");
+    uint64_t Value, Align;
+    if (!TryGetMCExprValue(Args[0], Value) ||
+        !TryGetMCExprValue(Args[1], Align))
+      return false;
+
+    Res = MCValue::get(alignTo(Value, Align));
+    return true;
+  }
+
+  if (Kind == AGVK_TotalNumVGPRs90A) {
+    assert(Args.size() == 2 &&
+           "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs90A");
+    uint64_t NumAGPR, NumVGPR, Total;
+    if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+        !TryGetMCExprValue(Args[1], NumVGPR))
+      return false;
+
+    if (NumAGPR) {
+      Total = alignTo(NumVGPR, 4) + NumAGPR;
+    } else {
+      Total = std::max(NumVGPR, NumAGPR);
+    }
+
+    Res = MCValue::get(Total);
+    return true;
+  }
+
+  if (Kind == AGVK_TotalNumVGPRs) {
+    assert(Args.size() == 2 &&
+           "AMDGPUVariadic Argument count incorrect for TotalNumVGPRs");
+    uint64_t NumAGPR, NumVGPR;
+    if (!TryGetMCExprValue(Args[0], NumAGPR) ||
+        !TryGetMCExprValue(Args[1], NumVGPR))
+      return false;
+
+    Res = MCValue::get(std::max(NumVGPR, NumAGPR));
+    return true;
+  }
+
+  if (Kind == AGVK_Occupancy) {
+    assert(Args.size() == 7 &&
+           "AMDGPUVariadic Argument count incorrect for Occupancy");
+    uint64_t InitOccupancy, MaxWaves, Granule, TargetTotalNumVGPRs, Generation,
+        NumSGPRs, NumVGPRs;
+
+    bool Success = true;
+    Success &= TryGetMCExprValue(Args[0], MaxWaves);
+    Success &= TryGetMCExprValue(Args[1], Granule);
+    Success &= TryGetMCExprValue(Args[2], TargetTotalNumVGPRs);
+    Success &= TryGetMCExprValue(Args[3], Generation);
+    Success &= TryGetMCExprValue(Args[4], InitOccupancy);
+
+    assert(Success &&
+           "Arguments 1 to 5 for Occupancy should be known constants");
+
+    if (!Success || !TryGetMCExprValue(Args[5], NumSGPRs) ||
+        !TryGetMCExprValue(Args[6], NumVGPRs))
+      return false;
+
+    auto OccWithNumVGPRs = [&](uint64_t NumVGPRs) -> uint64_t {
+      return IsaInfo::getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves,
+                                                   TargetTotalNumVGPRs);
+    };
+
+    // Mirrors GCNSubtarget::getOccupancyWithNumSGPRs without dependency on
+    // subtarget.
+    auto OccWithNumSGPRs = [&](uint64_t NumSGPRs) -> uint64_t {
+      if (Generation >= AMDGPUSubtarget::GFX10)
+        return MaxWaves;
+
+      if (Generation >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+        if (NumSGPRs <= 80)
+          return 10;
+        if (NumSGPRs <= 88)
+          return 9;
+        if (NumSGPRs <= 100)
+          return 8;
+        return 7;
+      }
+      if (NumSGPRs <= 48)
+        return 10;
+      if (NumSGPRs <= 56)
+        return 9;
+      if (NumSGPRs <= 64)
+        return 8;
+      if (NumSGPRs <= 72)
+        return 7;
+      if (NumSGPRs <= 80)
+        return 6;
+      return 5;
+    };
+
+    uint64_t Occupancy = InitOccupancy;
+    if (NumSGPRs)
+      Occupancy = std::min(Occupancy, OccWithNumSGPRs(NumSGPRs));
+    if (NumVGPRs)
+      Occupancy = std::min(Occupancy, OccWithNumVGPRs(NumVGPRs));
+
+    Res = MCValue::get(Occupancy);
+    return true;
+  }
+
   for (const MCExpr *Arg : Args) {
     MCValue ArgRes;
     if (!Arg->evaluateAsRelocatable(ArgRes, Layout, Fixup) ||
@@ -113,3 +284,53 @@ MCFragment *AMDGPUVariadicMCExpr::findAssociatedFragment() const {
   }
   return nullptr;
 }
+
+/// Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed
+/// are unresolvable but needed for further MCExprs). Derived from
+/// implementation of IsaInfo::getNumExtraSGPRs in AMDGPUBaseInfo.cpp.
+///
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createExtraSGPRs(
+    const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, unsigned MajorVersion,
+    bool hasArchitectedFlatScratch, bool XNACKUsed, MCContext &Ctx) {
+  auto GetConstantExpr = [&Ctx](int64_t Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
+
+  return create(AGVK_ExtraSGPRs,
+                {GetConstantExpr(MajorVersion), VCCUsed, FlatScrUsed,
+                 GetConstantExpr(XNACKUsed),
+                 GetConstantExpr(hasArchitectedFlatScratch)},
+                Ctx);
+}
+
+const AMDGPUVariadicMCExpr *AMDGPUVariadicMCExpr::createTotalNumVGPR(
+    bool has90AInsts, const MCExpr *NumAGPR, const MCExpr *NumVGPR,
+    MCContext &Ctx) {
+  return create(has90AInsts ? AGVK_TotalNumVGPRs90A : AGVK_TotalNumVGPRs,
+                {NumAGPR, NumVGPR}, Ctx);
+}
+
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only only the necessary values
+/// for said occupancy computation. Should match computeOccupancy implementation
+/// without passing \p STM on.
+const AMDGPUVariadicMCExpr *
+AMDGPUVariadicMCExpr::createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+                                      const MCExpr *NumVGPRs,
+                                      const GCNSubtarget &STM, MCContext &Ctx) {
+  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM);
+  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+  unsigned Generation = STM.getGeneration();
+
+  auto CreateExpr = [&Ctx](unsigned Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
+
+  return create(AGVK_Occupancy,
+                {CreateExpr(MaxWaves), CreateExpr(Granule),
+                 CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
+                 CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
+                Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 238e0dea791b24d..f317ef73fa3e248 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -14,6 +14,9 @@
 
 namespace llvm {
 
+class Function;
+class GCNSubtarget;
+
 /// AMDGPU target specific variadic MCExpr operations.
 ///
 /// Takes in a minimum of 1 argument to be used with an operation. The supported
@@ -26,7 +29,16 @@ namespace llvm {
 ///
 class AMDGPUVariadicMCExpr : public MCTargetExpr {
 public:
-  enum VariadicKind { AGVK_None, AGVK_Or, AGVK_Max };
+  enum VariadicKind {
+    AGVK_None,
+    AGVK_Or,
+    AGVK_Max,
+    AGVK_ExtraSGPRs,
+    AGVK_TotalNumVGPRs,
+    AGVK_TotalNumVGPRs90A,
+    AGVK_AlignTo,
+    AGVK_Occupancy
+  };
 
 private:
   VariadicKind Kind;
@@ -52,6 +64,27 @@ class AMDGPUVariadicMCExpr : public MCTargetExpr {
     return create(VariadicKind::AGVK_Max, Args, Ctx);
   }
 
+  static const AMDGPUVariadicMCExpr *
+  createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed,
+                   unsigned MajorVersion, bool hasArchitectedFlatScratch,
+                   bool XNACKUsed, MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *createTotalNumVGPR(bool has90AInsts,
+                                                        const MCExpr *NumAGPR,
+                                                        const MCExpr *NumVGPR,
+                                                        MCContext &Ctx);
+
+  static const AMDGPUVariadicMCExpr *
+  createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx) {
+    return create(VariadicKind::AGVK_AlignTo, {Value, Align}, Ctx);
+  }
+
+  static const AMDGPUVariadicMCExpr *createOccupancy(unsigned InitOcc,
+                                                     const MCExpr *NumSGPRs,
+                                                     const MCExpr *NumVGPRs,
+                                                     const GCNSubtarget &STM,
+                                                     MCContext &Ctx);
+
   VariadicKind getKind() const { return Kind; }
   const MCExpr *getSubExpr(size_t Index) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 9ed7aacc0538ec1..0d40816cdd4b8ea 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -18,57 +18,114 @@
 #include "GCNSubtarget.h"
 #include "SIDefines.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
 
 using namespace llvm;
 
-uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
-  uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
-                 S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
-                 S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) |
-                 S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+void SIProgramInfo::reset(const MachineFunction &MF) {
+  MCContext &Ctx = MF.getContext();
+
+  const MCExpr *ZeroExpr = MCConstantExpr::create(0, Ctx);
+
+  VGPRBlocks = ZeroExpr;
+  SGPRBlocks = ZeroExpr;
+  Priority = 0;
+  FloatMode = 0;
+  Priv = 0;
+  DX10Clamp = 0;
+  DebugMode = 0;
+  IEEEMode = 0;
+  WgpMode = 0;
+  MemOrdered = 0;
+  RrWgMode = 0;
+  ScratchSize = ZeroExpr;
+
+  LDSBlocks = 0;
+  ScratchBlocks = ZeroExpr;
+
+  ScratchEnable = ZeroExpr;
+  UserSGPR = 0;
+  TrapHandlerEnable = 0;
+  TGIdXEnable = 0;
+  TGIdYEnable = 0;
+  TGIdZEnable = 0;
+  TGSizeEnable = 0;
+  TIdIGCompCount = 0;
+  EXCPEnMSB = 0;
+  LdsSize = 0;
+  EXCPEnable = 0;
+
+  ComputePGMRSrc3GFX90A = ZeroExpr;
+
+  NumVGPR = ZeroExpr;
+  NumArchVGPR = ZeroExpr;
+  NumAccVGPR = ZeroExpr;
+  AccumOffset = ZeroExpr;
+  TgSplit = 0;
+  NumSGPR = ZeroExpr;
+  SGPRSpill = 0;
+  VGPRSpill = 0;
+  LDSSize = 0;
+  FlatUsed = ZeroExpr;
+
+  NumSGPRsForWavesPerEU = ZeroExpr;
+  NumVGPRsForWavesPerEU = ZeroExpr;
+  Occupancy = ZeroExpr;
+  DynamicCallStack = ZeroExpr;
+  VCCUsed = ZeroExpr;
+}
+
+static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+                                      const GCNSubtarget &ST) {
+  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+                 S_00B848_PRIV(ProgInfo.Priv) |
+                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
+                 S_00B848_WGP_MODE(ProgInfo.WgpMode) |
+                 S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
 
   if (ST.hasDX10ClampMode())
-    Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
 
   if (ST.hasIEEEMode())
-    Reg |= S_00B848_IEEE_MODE(IEEEMode);
+    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
   if (ST.hasRrWGMode())
-    Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
 
   return Reg;
 }
 
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
-                                    const GCNSubtarget &ST) const {
-  if (AMDGPU::isCompute(CC)) {
-    return getComputePGMRSrc1(ST);
-  }
-  uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
-                 S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
-                 S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode);
+static uint64_t getPGMRSrc1Reg(const SIProgramInfo &ProgInfo,
+                               CallingConv::ID CC, const GCNSubtarget &ST) {
+  uint64_t Reg = S_00B848_PRIORITY(ProgInfo.Priority) |
+                 S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
+                 S_00B848_PRIV(ProgInfo.Priv) |
+                 S_00B848_DEBUG_MODE(ProgInfo.DebugMode);
 
   if (ST.hasDX10ClampMode())
-    Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+    Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
 
   if (ST.hasIEEEMode())
-    Reg |= S_00B848_IEEE_MODE(IEEEMode);
+    Reg |= S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 
   if (ST.hasRrWGMode())
-    Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+    Reg |= S_00B848_RR_WG_MODE(ProgInfo.RrWgMode);
 
   switch (CC) {
   case CallingConv::AMDGPU_PS:
-    Reg |= S_00B028_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B028_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_VS:
-    Reg |= S_00B128_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B128_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_GS:
-    Reg |= S_00B228_WGP_MODE(WgpMode) | S_00B228_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B228_WGP_MODE(ProgInfo.WgpMode) |
+           S_00B228_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   case CallingConv::AMDGPU_HS:
-    Reg |= S_00B428_WGP_MODE(WgpMode) | S_00B428_MEM_ORDERED(MemOrdered);
+    Reg |= S_00B428_WGP_MODE(ProgInfo.WgpMode) |
+           S_00B428_MEM_ORDERED(ProgInfo.MemOrdered);
     break;
   default:
     break;
@@ -76,22 +133,108 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
   return Reg;
 }
 
-uint64_t SIProgramInfo::getComputePGMRSrc2() const {
-  uint64_t Reg =
-      S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
-      S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
-      S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
-      S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
-      S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
-      S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
-      S_00B84C_EXCP_EN(EXCPEnable);
+static uint64_t getComputePGMRSrc2Reg(const SIProgramInfo &ProgInfo) {
+  uint64_t Reg = S_00B84C_USER_SGPR(ProgInfo.UserSGPR) |
+                 S_00B84C_TRAP_HANDLER(ProgInfo.TrapHandlerEnable) |
+                 S_00B84C_TGID_X_EN(ProgInfo.TGIdXEnable) |
+                 S_00B84C_TGID_Y_EN(ProgInfo.TGIdYEnable) |
+                 S_00B84C_TGID_Z_EN(ProgInfo.TGIdZEnable) |
+                 S_00B84C_TG_SIZE_EN(ProgInfo.TGSizeEnable) |
+                 S_00B84C_TIDIG_COMP_CNT(ProgInfo.TIdIGCompCount) |
+                 S_00B84C_EXCP_EN_MSB(ProgInfo.EXCPEnMSB) |
+                 S_00B84C_LDS_SIZE(ProgInfo.LdsSize) |
+                 S_00B84C_EXCP_EN(ProgInfo.EXCPEnable);
+
+  return Reg;
+}
+
+static const MCExpr *MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift,
+                               MCContext &Ctx) {
+  if (Mask) {
+    const MCExpr *MaskExpr = MCConstantExpr::create(Mask, Ctx);
+    Val = MCBinaryExpr::createAnd(Val, MaskExpr, Ctx);
+  }
+  if (Shift) {
+    const MCExpr *ShiftExpr = MCConstantExpr::create(Shift, Ctx);
+    Val = MCBinaryExpr::createShl(Val, ShiftExpr, Ctx);
+  }
+  return Val;
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
+  int64_t VBlocks, SBlocks;
+  VGPRBlocks->evaluateAsAbsolute(VBlocks);
+  SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+  uint64_t Reg = S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+                 S_00B848_SGPRS(static_cast<uint64_t>(SBlocks)) |
+                 getComputePGMRSrc1Reg(*this, ST);
 
   return Reg;
 }
 
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+                                    const GCNSubtarget &ST) const {
+  if (AMDGPU::isCompute(CC)) {
+    return getComputePGMRSrc1(ST);
+  }
+  int64_t VBlocks, SBlocks;
+  VGPRBlocks->evaluateAsAbsolute(VBlocks);
+  SGPRBlocks->evaluateAsAbsolute(SBlocks);
+
+  return getPGMRSrc1Reg(*this, CC, ST) |
+         S_00B848_VGPRS(static_cast<uint64_t>(VBlocks)) |
+         S_00B848_SGPRS(static_cast<uint64_t>(SBlocks));
+}
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+  int64_t ScratchEn;
+  ScratchEnable->evaluateAsAbsolute(ScratchEn);
+  return ScratchEn | getComputePGMRSrc2Reg(*this);
+}
+
 uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
   if (AMDGPU::isCompute(CC))
     return getComputePGMRSrc2();
 
   return 0;
 }
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST,
+                                                MCContext &Ctx) const {
+  uint64_t Reg = getComputePGMRSrc1Reg(*this, ST);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  const MCExpr *Res = MCBinaryExpr::createOr(
+      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+                                         const GCNSubtarget &ST,
+                                         MCContext &Ctx) const {
+  if (AMDGPU::isCompute(CC)) {
+    return getComputePGMRSrc1(ST, Ctx);
+  }
+
+  uint64_t Reg = getPGMRSrc1Reg(*this, CC, ST);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  const MCExpr *Res = MCBinaryExpr::createOr(
+      MaskShift(VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0, Ctx),
+      MaskShift(SGPRBlocks, /*Mask=*/0xF, /*Shift=*/6, Ctx), Ctx);
+  return MCBinaryExpr::createOr(RegExpr, Res, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getComputePGMRSrc2(MCContext &Ctx) const {
+  uint64_t Reg = getComputePGMRSrc2Reg(*this);
+  const MCExpr *RegExpr = MCConstantExpr::create(Reg, Ctx);
+  return MCBinaryExpr::createOr(ScratchEnable, RegExpr, Ctx);
+}
+
+const MCExpr *SIProgramInfo::getPGMRSrc2(CallingConv::ID CC,
+                                         MCContext &Ctx) const {
+  if (AMDGPU::isCompute(CC))
+    return getComputePGMRSrc2(Ctx);
+
+  return MCConstantExpr::create(0, Ctx);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 8c26789f936cff4..5ff9b607c7fbcf2 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -22,12 +22,15 @@
 namespace llvm {
 
 class GCNSubtarget;
+class MCContext;
+class MCExpr;
+class MachineFunction;
 
 /// Track resource usage for kernels / entry functions.
 struct SIProgramInfo {
     // Fields set in PGM_RSRC1 pm4 packet.
-    uint32_t VGPRBlocks = 0;
-    uint32_t SGPRBlocks = 0;
+    const MCExpr *VGPRBlocks = 0;
+    const MCExpr *SGPRBlocks = 0;
     uint32_t Priority = 0;
     uint32_t FloatMode = 0;
     uint32_t Priv = 0;
@@ -37,14 +40,14 @@ struct SIProgramInfo {
     uint32_t WgpMode = 0; // GFX10+
     uint32_t MemOrdered = 0; // GFX10+
     uint32_t RrWgMode = 0;   // GFX12+
-    uint64_t ScratchSize = 0;
+    const MCExpr *ScratchSize = nullptr;
 
     // State used to calculate fields set in PGM_RSRC2 pm4 packet.
     uint32_t LDSBlocks = 0;
-    uint32_t ScratchBlocks = 0;
+    const MCExpr *ScratchBlocks = nullptr;
 
     // Fields set in PGM_RSRC2 pm4 packet
-    uint32_t ScratchEnable = 0;
+    const MCExpr *ScratchEnable = nullptr;
     uint32_t UserSGPR = 0;
     uint32_t TrapHandlerEnable = 0;
     uint32_t TGIdXEnable = 0;
@@ -56,44 +59,52 @@ struct SIProgramInfo {
     uint32_t LdsSize = 0;
     uint32_t EXCPEnable = 0;
 
-    uint64_t ComputePGMRSrc3GFX90A = 0;
+    const MCExpr *ComputePGMRSrc3GFX90A = nullptr;
 
-    uint32_t NumVGPR = 0;
-    uint32_t NumArchVGPR = 0;
-    uint32_t NumAccVGPR = 0;
-    uint32_t AccumOffset = 0;
+    const MCExpr *NumVGPR = nullptr;
+    const MCExpr *NumArchVGPR = nullptr;
+    const MCExpr *NumAccVGPR = nullptr;
+    const MCExpr *AccumOffset = nullptr;
     uint32_t TgSplit = 0;
-    uint32_t NumSGPR = 0;
+    const MCExpr *NumSGPR = nullptr;
     unsigned SGPRSpill = 0;
     unsigned VGPRSpill = 0;
     uint32_t LDSSize = 0;
-    bool FlatUsed = false;
+    const MCExpr *FlatUsed = nullptr;
 
     // Number of SGPRs that meets number of waves per execution unit request.
-    uint32_t NumSGPRsForWavesPerEU = 0;
+    const MCExpr *NumSGPRsForWavesPerEU = nullptr;
 
     // Number of VGPRs that meets number of waves per execution unit request.
-    uint32_t NumVGPRsForWavesPerEU = 0;
+    const MCExpr *NumVGPRsForWavesPerEU = nullptr;
 
     // Final occupancy.
-    uint32_t Occupancy = 0;
+    const MCExpr *Occupancy = nullptr;
 
     // Whether there is recursion, dynamic allocas, indirect calls or some other
     // reason there may be statically unknown stack usage.
-    bool DynamicCallStack = false;
+    const MCExpr *DynamicCallStack = nullptr;
 
     // Bonus information for debugging.
-    bool VCCUsed = false;
+    const MCExpr *VCCUsed = nullptr;
 
     SIProgramInfo() = default;
 
+    void reset(const MachineFunction &MF);
+
     /// Compute the value of the ComputePGMRsrc1 register.
     uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
     uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
+    const MCExpr *getComputePGMRSrc1(const GCNSubtarget &ST,
+                                     MCContext &Ctx) const;
+    const MCExpr *getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST,
+                              MCContext &Ctx) const;
 
     /// Compute the value of the ComputePGMRsrc2 register.
     uint64_t getComputePGMRSrc2() const;
     uint64_t getPGMRSrc2(CallingConv::ID CC) const;
+    const MCExpr *getComputePGMRSrc2(MCContext &Ctx) const;
+    const MCExpr *getPGMRSrc2(CallingConv::ID CC, MCContext &Ctx) const;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4e0074451aa58ca..82897a68082a746 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1129,12 +1129,18 @@ unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
 
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
                                       unsigned NumVGPRs) {
-  unsigned MaxWaves = getMaxWavesPerEU(STI);
-  unsigned Granule = getVGPRAllocGranule(STI);
+  return getNumWavesPerEUWithNumVGPRs(NumVGPRs, getVGPRAllocGranule(STI),
+                                      getMaxWavesPerEU(STI),
+                                      getTotalNumVGPRs(STI));
+}
+
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+                                      unsigned MaxWaves,
+                                      unsigned TotalNumVGPRs) {
   if (NumVGPRs < Granule)
     return MaxWaves;
   unsigned RoundedRegs = alignTo(NumVGPRs, Granule);
-  return std::min(std::max(getTotalNumVGPRs(STI) / RoundedRegs, 1u), MaxWaves);
+  return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
 }
 
 unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 943588fe701cc8a..8b9ce1b191a2988 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -311,6 +311,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
                                       unsigned NumVGPRs);
 
+/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
+/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
+unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
+                                      unsigned MaxWaves,
+                                      unsigned TotalNumVGPRs);
+
 /// \returns Number of VGPR blocks needed for given subtarget \p STI when
 /// \p NumVGPRs are used. We actually return the number of blocks -1, since
 /// that's what we encode.
diff --git a/llvm/test/MC/AMDGPU/alignto_mcexpr.s b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
new file mode 100644
index 000000000000000..e864f3736828c4b
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/alignto_mcexpr.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set alignto_zero_eight, 0
+// ASM: .set alignto_one_eight, 8
+// ASM: .set alignto_five_eight, 8
+// ASM: .set alignto_seven_eight, 8
+// ASM: .set alignto_eight_eight, 8
+// ASM: .set alignto_ten_eight, 16
+
+.set alignto_zero_eight, alignto(0, 8)
+.set alignto_one_eight, alignto(1, 8)
+.set alignto_five_eight, alignto(5, 8)
+.set alignto_seven_eight, alignto(7, 8)
+.set alignto_eight_eight, alignto(8, 8)
+.set alignto_ten_eight, alignto(10, 8)
diff --git a/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
new file mode 100644
index 000000000000000..89e4954b8054715
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/extrasgprs_mcexpr.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set extrasgpr_none_gfx7, 0
+// ASM: .set extrasgpr_none_gfx9, 0
+// ASM: .set extrasgpr_none_gfx10, 0
+
+.set extrasgpr_none_gfx7, extrasgprs(7, 0, 0, 0, 0)
+.set extrasgpr_none_gfx9, extrasgprs(9, 0, 0, 0, 0)
+.set extrasgpr_none_gfx10, extrasgprs(10, 0, 0, 0, 0)
+
+// ASM: .set extrasgpr_vcc_gfx7, 2
+// ASM: .set extrasgpr_vcc_gfx9, 2
+// ASM: .set extrasgpr_vcc_gfx10, 2
+
+.set extrasgpr_vcc_gfx7, extrasgprs(7, 1, 0, 0, 0)
+.set extrasgpr_vcc_gfx9, extrasgprs(9, 1, 0, 0, 0)
+.set extrasgpr_vcc_gfx10, extrasgprs(10, 1, 0, 0, 0)
+
+// ASM: .set extrasgpr_flatscr_gfx7, 4
+// ASM: .set extrasgpr_flatscr_gfx9, 6
+// ASM: .set extrasgpr_flatscr_gfx10, 0
+
+.set extrasgpr_flatscr_gfx7, extrasgprs(7, 0, 1, 0, 0)
+.set extrasgpr_flatscr_gfx9, extrasgprs(9, 0, 1, 0, 0)
+.set extrasgpr_flatscr_gfx10, extrasgprs(10, 0, 1, 0, 0)
+
+// ASM: .set extrasgpr_xnack_gfx7, 0
+// ASM: .set extrasgpr_xnack_gfx9, 4
+// ASM: .set extrasgpr_xnack_gfx10, 0
+
+.set extrasgpr_xnack_gfx7, extrasgprs(7, 0, 0, 1, 0)
+.set extrasgpr_xnack_gfx9, extrasgprs(9, 0, 0, 1, 0)
+.set extrasgpr_xnack_gfx10, extrasgprs(10, 0, 0, 1, 0)
+
+// ASM: .set extrasgpr_archflatscr_gfx7, 0
+// ASM: .set extrasgpr_archflatscr_gfx9, 6
+// ASM: .set extrasgpr_archflatscr_gfx10, 0
+
+.set extrasgpr_archflatscr_gfx7, extrasgprs(7, 0, 0, 0, 1)
+.set extrasgpr_archflatscr_gfx9, extrasgprs(9, 0, 0, 0, 1)
+.set extrasgpr_archflatscr_gfx10, extrasgprs(10, 0, 0, 0, 1)
diff --git a/llvm/test/MC/AMDGPU/occupancy_mcexpr.s b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
new file mode 100644
index 000000000000000..06bec8c538daeab
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/occupancy_mcexpr.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set occupancy_init_one, 1
+// ASM: .set occupancy_init_seven, 7
+// ASM: .set occupancy_init_eight, 8
+
+.set occupancy_init_one, occupancy(0, 0, 0, 0, 1, 0, 0)
+.set occupancy_init_seven, occupancy(0, 0, 0, 0, 7, 0, 0)
+.set occupancy_init_eight, occupancy(0, 0, 0, 0, 8, 0, 0)
+
+// ASM: .set occupancy_numsgpr_seaisle_ten, 10
+// ASM: .set occupancy_numsgpr_seaisle_nine, 9
+// ASM: .set occupancy_numsgpr_seaisle_eight, 8
+// ASM: .set occupancy_numsgpr_seaisle_seven, 7
+// ASM: .set occupancy_numsgpr_seaisle_six, 6
+// ASM: .set occupancy_numsgpr_seaisle_five, 5
+
+.set occupancy_numsgpr_seaisle_ten, occupancy(0, 0, 0, 6, 11, 1, 0)
+.set occupancy_numsgpr_seaisle_nine, occupancy(0, 0, 0, 6, 11, 49, 0)
+.set occupancy_numsgpr_seaisle_eight, occupancy(0, 0, 0, 6, 11, 57, 0)
+.set occupancy_numsgpr_seaisle_seven, occupancy(0, 0, 0, 6, 11, 65, 0)
+.set occupancy_numsgpr_seaisle_six, occupancy(0, 0, 0, 6, 11, 73, 0)
+.set occupancy_numsgpr_seaisle_five, occupancy(0, 0, 0, 6, 11, 81, 0)
+
+// ASM: .set occupancy_numsgpr_gfx9_ten, 10
+// ASM: .set occupancy_numsgpr_gfx9_nine, 9
+// ASM: .set occupancy_numsgpr_gfx9_eight, 8
+// ASM: .set occupancy_numsgpr_gfx9_seven, 7
+
+.set occupancy_numsgpr_gfx9_ten, occupancy(0, 0, 0, 8, 11, 1, 0)
+.set occupancy_numsgpr_gfx9_nine, occupancy(0, 0, 0, 8, 11, 81, 0)
+.set occupancy_numsgpr_gfx9_eight, occupancy(0, 0, 0, 8, 11, 89, 0)
+.set occupancy_numsgpr_gfx9_seven, occupancy(0, 0, 0, 8, 11, 101, 0)
+
+// ASM: .set occupancy_numsgpr_gfx10_one, 1
+// ASM: .set occupancy_numsgpr_gfx10_seven, 7
+// ASM: .set occupancy_numsgpr_gfx10_eight, 8
+
+.set occupancy_numsgpr_gfx10_one, occupancy(1, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_seven, occupancy(7, 0, 0, 9, 11, 1, 0)
+.set occupancy_numsgpr_gfx10_eight, occupancy(8, 0, 0, 9, 11, 1, 0)
+
+// ASM: .set occupancy_numvgpr_high_granule_one, 1
+// ASM: .set occupancy_numvgpr_high_granule_seven, 7
+// ASM: .set occupancy_numvgpr_high_granule_eight, 8
+
+.set occupancy_numvgpr_high_granule_one, occupancy(1, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_seven, occupancy(7, 2, 0, 0, 11, 0, 1)
+.set occupancy_numvgpr_high_granule_eight, occupancy(8, 2, 0, 0, 11, 0, 1)
+
+// ASM: .set occupancy_numvgpr_low_total_one, 1
+// ASM: .set occupancy_numvgpr_one, 1
+// ASM: .set occupancy_numvgpr_seven, 7
+// ASM: .set occupancy_numvgpr_eight, 8
+// ASM: .set occupancy_numvgpr_ten, 10
+
+.set occupancy_numvgpr_low_total_one, occupancy(11, 4, 2, 0, 11, 0, 4)
+.set occupancy_numvgpr_one, occupancy(11, 4, 4, 0, 11, 0, 4)
+.set occupancy_numvgpr_seven, occupancy(11, 4, 28, 0, 11, 0, 4)
+.set occupancy_numvgpr_eight, occupancy(11, 4, 32, 0, 11, 0, 4)
+.set occupancy_numvgpr_ten, occupancy(11, 4, 40, 0, 11, 0, 4)
diff --git a/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
new file mode 100644
index 000000000000000..58f317731df8493
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/totalnumvgpr_mcexpr.s
@@ -0,0 +1,25 @@
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa < %s | FileCheck --check-prefix=ASM %s
+
+// ASM: .set totalvgpr_none, 0
+// ASM: .set totalvgpr_one, 1
+// ASM: .set totalvgpr_two, 2
+
+.set totalvgpr_none, totalnumvgprs(0, 0)
+.set totalvgpr_one, totalnumvgprs(1, 0)
+.set totalvgpr_two, totalnumvgprs(1, 2)
+
+// ASM: .set totalvgpr90a_none, 0
+// ASM: .set totalvgpr90a_one, 1
+// ASM: .set totalvgpr90a_two, 2
+
+.set totalvgpr90a_none, totalnumvgprs90a(0, 0)
+.set totalvgpr90a_one, totalnumvgprs90a(0, 1)
+.set totalvgpr90a_two, totalnumvgprs90a(0, 2)
+
+// ASM: .set totalvgpr90a_agpr_minimal, 1
+// ASM: .set totalvgpr90a_agpr_rounded_eight, 8
+// ASM: .set totalvgpr90a_agpr_exact_eight, 8
+
+.set totalvgpr90a_agpr_minimal, totalnumvgprs90a(1, 0)
+.set totalvgpr90a_agpr_rounded_eight, totalnumvgprs90a(4, 2)
+.set totalvgpr90a_agpr_exact_eight, totalnumvgprs90a(4, 4)