[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #149367)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 29 15:54:49 PDT 2025


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/149367

>From 929584982b8fd23e45e27420ed3218fe67076287 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 23 Jul 2025 15:41:11 -0700
Subject: [PATCH 1/5] [AMDGPU] More accurately account for AVGPR pressure

Change-Id: I6f129c2723b52a391a96178e390f60535164ac9b
---
 .../Target/AMDGPU/GCNIterativeScheduler.cpp   |  46 +-
 llvm/lib/Target/AMDGPU/GCNRegPressure.cpp     | 156 +++---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h       | 129 +++--
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  72 +--
 .../lib/Target/AMDGPU/SIFormMemoryClauses.cpp |   9 +-
 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir   | 481 ++++++++++++++++++
 .../AMDGPU/debug-value-scheduler-liveins.mir  |   2 +-
 7 files changed, 726 insertions(+), 169 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index f253a841f16a6..87f5b9f16868a 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
 // BestSchedules aren't deleted on fail.
 unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
   // TODO: assert Regions are sorted descending by pressure
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
-  const unsigned DynamicVGPRBlockSize =
-      MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
-  const auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+  const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
   LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
                     << ", current = " << Occ << '\n');
 
@@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     // Always build the DAG to add mutations
     BuildDAG DAG(*R, *this);
 
-    if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
+    if (R->MaxPressure.getOccupancy(MF) >= NewOcc)
       continue;
 
     LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
     LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
                printSchedRP(dbgs(), R->MaxPressure, MaxRP));
 
-    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF));
     if (NewOcc <= Occ)
       break;
 
@@ -488,15 +484,12 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
 }
 
 void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
-  bool TryMaximizeOccupancy) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
+    bool TryMaximizeOccupancy) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
-  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
 
   sortRegionsByPressure(TgtOcc);
-  auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -527,21 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
       const auto RP = getRegionPressure(*R);
       LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-      if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+      if (RP.getOccupancy(MF) < TgtOcc) {
         LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-        if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-                                         ST, DynamicVGPRBlockSize) >= TgtOcc) {
+        if (R->BestSchedule.get() &&
+            R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
           LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
           scheduleBest(*R);
         } else {
           LLVM_DEBUG(dbgs() << ", restoring\n");
           Ovr.restoreOrder();
-          assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
-                 TgtOcc);
+          assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc);
         }
       }
-      FinalOccupancy =
-          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
@@ -582,16 +573,12 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
 ///////////////////////////////////////////////////////////////////////////////
 // ILP scheduler port
 
-void GCNIterativeScheduler::scheduleILP(
-  bool TryMaximizeOccupancy) {
-  const auto &ST = MF.getSubtarget<GCNSubtarget>();
+void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   auto TgtOcc = MFI->getMinAllowedOccupancy();
-  unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
 
   sortRegionsByPressure(TgtOcc);
-  auto Occ =
-      Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
 
   bool IsReentry = false;
   if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -612,18 +599,17 @@ void GCNIterativeScheduler::scheduleILP(
     const auto RP = getSchedulePressure(*R, ILPSchedule);
     LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
 
-    if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+    if (RP.getOccupancy(MF) < TgtOcc) {
       LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
-      if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
-                                       ST, DynamicVGPRBlockSize) >= TgtOcc) {
+      if (R->BestSchedule.get() &&
+          R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
         LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
         scheduleBest(*R);
       }
     } else {
       scheduleRegion(*R, ILPSchedule, RP);
       LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
-      FinalOccupancy =
-          std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+      FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
     }
   }
   MFI->limitOccupancy(FinalOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3a2a5b4..dd007e6cd6b31 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg,
 bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
                           unsigned MaxOccupancy) const {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
 
   const auto SGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(getSGPRNum()));
   const auto VGPROcc = std::min(
-      MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
-                                                DynamicVGPRBlockSize));
+      MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+                        getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
+                        DynamicVGPRBlockSize));
   const auto OtherSGPROcc = std::min(MaxOccupancy,
                                 ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
-  const auto OtherVGPROcc =
-      std::min(MaxOccupancy,
-               ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
-                                           DynamicVGPRBlockSize));
+  const auto OtherVGPROcc = std::min(
+      MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+                        O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
+                        DynamicVGPRBlockSize));
 
   const auto Occ = std::min(SGPROcc, VGPROcc);
   const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -135,35 +137,39 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
   unsigned OtherVGPRForSGPRSpills =
       (OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
 
-  unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
-
   // Unified excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
-  unsigned ExcessVGPR =
-      std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
-                                VGPRForSGPRSpills - MaxVGPRs),
-               0);
-  unsigned OtherExcessVGPR =
-      std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
-                                OtherVGPRForSGPRSpills - MaxVGPRs),
-               0);
+  unsigned ExcessVGPR = std::max(
+      static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
+                       VGPRForSGPRSpills - MaxVGPRs),
+      0);
+  unsigned OtherExcessVGPR = std::max(
+      static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
+                       OtherVGPRForSGPRSpills - MaxVGPRs),
+      0);
   // Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
   // spills
-  unsigned ExcessArchVGPR = std::max(
-      static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
-      0);
+  unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs();
+  unsigned ExcessArchVGPR =
+      std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) +
+                                VGPRForSGPRSpills - AddressableArchVGPRs),
+               0);
   unsigned OtherExcessArchVGPR =
-      std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
-                                MaxArchVGPRs),
+      std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) +
+                                OtherVGPRForSGPRSpills - AddressableArchVGPRs),
                0);
   // AGPR excess pressure conditions
-  unsigned ExcessAGPR = std::max(
-      static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
-                                           : (getAGPRNum() - MaxVGPRs)),
-      0);
+  unsigned ExcessAGPR =
+      std::max(static_cast<int>(
+                   ST.hasGFX90AInsts()
+                       ? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+                       : (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
+               0);
   unsigned OtherExcessAGPR = std::max(
-      static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
-                                           : (O.getAGPRNum() - MaxVGPRs)),
+      static_cast<int>(
+          ST.hasGFX90AInsts()
+              ? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+              : (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
       0);
 
   bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
@@ -184,14 +190,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       return VGPRDiff > 0;
     if (SGPRDiff != 0) {
       unsigned PureExcessVGPR =
-          std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+          std::max(static_cast<int>(
+                       getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+                       MaxVGPRs),
                    0) +
-          std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
+          std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) -
+                                    AddressableArchVGPRs),
+                   0);
       unsigned OtherPureExcessVGPR =
-          std::max(
-              static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
-              0) +
-          std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
+          std::max(static_cast<int>(
+                       O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+                       MaxVGPRs),
+                   0) +
+          std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) -
+                                    AddressableArchVGPRs),
+                   0);
 
       // If we have a special case where there is a tie in excess VGPR, but one
       // of the pressures has VGPR usage from SGPR spills, prefer the pressure
@@ -221,38 +234,45 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
       if (SW != OtherSW)
         return SW < OtherSW;
     } else {
-      auto VW = getVGPRTuplesWeight();
-      auto OtherVW = O.getVGPRTuplesWeight();
+      auto VW = getVGPRTuplesWeight(ArchVGPRThreshold);
+      auto OtherVW = O.getVGPRTuplesWeight(ArchVGPRThreshold);
       if (VW != OtherVW)
         return VW < OtherVW;
     }
   }
 
   // Give final precedence to lower general RP.
-  return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
-                         (getVGPRNum(ST.hasGFX90AInsts()) <
-                          O.getVGPRNum(ST.hasGFX90AInsts()));
+  return SGPRImportant ? (getSGPRNum() < O.getSGPRNum())
+                       : (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <
+                          O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold));
 }
 
 Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
-                      unsigned DynamicVGPRBlockSize) {
-  return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
-    OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
-       << "AGPRs: " << RP.getAGPRNum();
-    if (ST)
-      OS << "(O"
-         << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
-                                         DynamicVGPRBlockSize)
-         << ')';
-    OS << ", SGPRs: " << RP.getSGPRNum();
-    if (ST)
-      OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
-    OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
-       << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
-    if (ST)
-      OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
-    OS << '\n';
-  });
+                      unsigned DynamicVGPRBlockSize,
+                      const MachineFunction *MF) {
+  unsigned ArchVGPRThreshold = std::numeric_limits<unsigned int>::max();
+  if (ST && MF)
+    ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;
+
+  return Printable(
+      [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
+        OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
+           << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold);
+        if (ST)
+          OS << "(O"
+             << ST->getOccupancyWithNumVGPRs(
+                    RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold),
+                    DynamicVGPRBlockSize)
+             << ')';
+        OS << ", SGPRs: " << RP.getSGPRNum();
+        if (ST)
+          OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
+        OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold)
+           << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
+        if (ST)
+          OS << " -> Occ: " << RP.getOccupancy(*MF);
+        OS << '\n';
+      });
 }
 
 static LaneBitmask getDefRegMask(const MachineOperand &MO,
@@ -398,8 +418,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   unsigned DynamicVGPRBlockSize =
       MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+  AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs();
   MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
-  MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
+  MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs);
   MaxUnifiedVGPRs =
       ST.hasGFX90AInsts()
           ? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
@@ -414,15 +435,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
 
   if (SRI->isSGPRClass(RC))
     return RP.getSGPRNum() > MaxSGPRs;
-  unsigned NumVGPRs =
-      SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
+
+  bool ShouldUseAGPR =
+      SRI->isAGPRClass(RC) ||
+      (SRI->isVectorSuperClass(RC) &&
+       RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs);
+  unsigned NumVGPRs = ShouldUseAGPR
+                          ? RP.getAGPRNum(AddressableNumArchVGPRs)
+                          : RP.getArchVGPRNum(AddressableNumArchVGPRs);
   return isVGPRBankSaveBeneficial(NumVGPRs);
 }
 
 bool GCNRPTarget::satisfied() const {
   if (RP.getSGPRNum() > MaxSGPRs)
     return false;
-  if (RP.getVGPRNum(false) > MaxVGPRs &&
+  if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs &&
       (!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
     return false;
   return satisfiesUnifiedTarget();
@@ -876,10 +903,13 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
 
   OS << "---\nname: " << MF.getName() << "\nbody:             |\n";
 
-  auto printRP = [](const GCNRegPressure &RP) {
-    return Printable([&RP](raw_ostream &OS) {
+  auto printRP = [&MF](const GCNRegPressure &RP) {
+    return Printable([&RP, &MF](raw_ostream &OS) {
       OS << format(PFX "  %-5d", RP.getSGPRNum())
-         << format(" %-5d", RP.getVGPRNum(false));
+         << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
+                                                     .getMaxNumVectorRegs(
+                                                         MF.getFunction())
+                                                     .first));
     });
   };
 
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a229110c1..8b80cc42c9bb0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -18,6 +18,7 @@
 #define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
 
 #include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include <algorithm>
@@ -43,51 +44,98 @@ struct GCNRegPressure {
 
   /// \returns the SGPR32 pressure
   unsigned getSGPRNum() const { return Value[SGPR]; }
-  /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
-  /// dependent upon \p UnifiedVGPRFile
-  unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+  unsigned getVGPRNum(bool UnifiedVGPRFile,
+                      unsigned AddressableArchVGPR) const {
     if (UnifiedVGPRFile) {
-      return Value[AGPR]
-                 ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
-                 : Value[VGPR] + Value[AVGPR];
+      return Value[AGPR] || Value[AVGPR]
+                 ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR],
+                                     AddressableArchVGPR)
+                 : Value[VGPR];
     }
     // AVGPR assignment priority is based on the width of the register. Account
     // AVGPR pressure as VGPR.
     return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
   }
 
+  inline static unsigned getAVGPRsAsVGPRsNum(unsigned NumArchVGPRs,
+                                             unsigned NumAVGPRs,
+                                             unsigned AddressableArchVGPR) {
+
+    return NumArchVGPRs < AddressableArchVGPR
+               ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs)
+               : 0;
+  }
+
+  inline static unsigned getAVGPRsAsAGPRsNum(unsigned NumArchVGPRs,
+                                             unsigned NumAGPRs,
+                                             unsigned NumAVGPRs,
+                                             unsigned AddressableArchVGPR) {
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+    return NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
+  }
+
   /// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
   /// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified
   /// VGPR file.
   inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
                                            unsigned NumAGPRs,
-                                           unsigned NumAVGPRs) {
-
-    // Assume AVGPRs will be assigned as VGPRs.
-    return alignTo(NumArchVGPRs + NumAVGPRs,
+                                           unsigned NumAVGPRs,
+                                           unsigned AddressableArchVGPR) {
+
+    // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that
+    // point, we will assign as AGPR.
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR);
+    return alignTo(NumArchVGPRs + AVGPRsAsVGPRs,
                    AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
-           NumAGPRs;
+           NumAGPRs + AVGPRsAsAGPRs;
   }
 
   /// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be
   /// allocated as VGPR
-  unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
+  unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR);
+
+    return Value[VGPR] + AVGPRsAsVGPRs;
+  }
   /// \returns the AccVGPR32 pressure
-  unsigned getAGPRNum() const { return Value[AGPR]; }
+  unsigned getAGPRNum(unsigned AddressableArchVGPR) const {
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        Value[VGPR], Value[AGPR], Value[AVGPR], AddressableArchVGPR);
+
+    return Value[AGPR] + AVGPRsAsAGPRs;
+  }
   /// \returns the AVGPR32 pressure
   unsigned getAVGPRNum() const { return Value[AVGPR]; }
 
-  unsigned getVGPRTuplesWeight() const {
-    return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
-                    Value[TOTAL_KINDS + AGPR]);
+  unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const {
+    unsigned AVGPRsAsVGPRs =
+        getAVGPRsAsVGPRsNum(Value[TOTAL_KINDS + VGPR],
+                            Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
+    unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+        Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR],
+        Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
+
+    return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs,
+                    Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs);
   }
   unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
 
-  unsigned getOccupancy(const GCNSubtarget &ST,
-                        unsigned DynamicVGPRBlockSize) const {
-    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
-                    ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
-                                                DynamicVGPRBlockSize));
+  unsigned getOccupancy(const MachineFunction &MF) const {
+    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+    unsigned DynamicVGPRBlockSize =
+        MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+
+    return std::min(
+        ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+        ST.getOccupancyWithNumVGPRs(
+            getVGPRNum(ST.hasGFX90AInsts(),
+                       ST.getMaxNumVectorRegs(MF.getFunction()).first),
+            DynamicVGPRBlockSize));
   }
 
   void inc(unsigned Reg,
@@ -95,10 +143,9 @@ struct GCNRegPressure {
            LaneBitmask NewMask,
            const MachineRegisterInfo &MRI);
 
-  bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
-                       unsigned DynamicVGPRBlockSize) const {
-    return getOccupancy(ST, DynamicVGPRBlockSize) >
-           O.getOccupancy(ST, DynamicVGPRBlockSize);
+  bool higherOccupancy(const GCNRegPressure &O,
+                       const MachineFunction &MF) const {
+    return getOccupancy(MF) > O.getOccupancy(MF);
   }
 
   /// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@@ -151,7 +198,7 @@ struct GCNRegPressure {
   friend GCNRegPressure max(const GCNRegPressure &P1,
                             const GCNRegPressure &P2);
 
-  friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
+  friend Printable print(const GCNRegPressure &RP,
                          unsigned DynamicVGPRBlockSize);
 };
 
@@ -220,16 +267,19 @@ class GCNRPTarget {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) {
     OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs
-       << " SGPRs, " << Target.RP.getArchVGPRNum() << '/' << Target.MaxVGPRs
-       << " ArchVGPRs, " << Target.RP.getAGPRNum() << '/' << Target.MaxVGPRs
-       << " AGPRs";
+       << " SGPRs, " << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs)
+       << '/' << Target.MaxVGPRs << " ArchVGPRs, "
+       << Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) << '/'
+       << Target.MaxVGPRs << " AGPRs";
 
     if (Target.MaxUnifiedVGPRs) {
-      OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
-         << " VGPRs (unified)";
+      OS << ", " << Target.RP.getVGPRNum(true, Target.AddressableNumArchVGPRs)
+         << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)";
     } else if (Target.CombineVGPRSavings) {
-      OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
-         << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
+      OS << ", "
+         << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) +
+                Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs)
+         << '/' << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
     }
     return OS;
   }
@@ -238,7 +288,6 @@ class GCNRPTarget {
 private:
   /// Current register pressure.
   GCNRegPressure RP;
-
   /// Target number of SGPRs.
   unsigned MaxSGPRs;
   /// Target number of ArchVGPRs and AGPRs.
@@ -246,6 +295,8 @@ class GCNRPTarget {
   /// Target number of overall VGPRs for subtargets with unified RFs. Always 0
   /// for subtargets with non-unified RFs.
   unsigned MaxUnifiedVGPRs;
+  /// The maximum number of arch vgprs allowed by the subtarget.
+  unsigned AddressableNumArchVGPRs;
   /// Whether we consider that the register allocator will be able to swap
   /// between ArchVGPRs and AGPRs by copying them to a super register class.
   /// Concretely, this allows savings in one of the VGPR banks to help toward
@@ -254,12 +305,15 @@ class GCNRPTarget {
 
   inline bool satisifiesVGPRBanksTarget() const {
     assert(CombineVGPRSavings && "only makes sense with combined savings");
-    return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
+    return RP.getArchVGPRNum(AddressableNumArchVGPRs) +
+               RP.getAGPRNum(AddressableNumArchVGPRs) <=
+           2 * MaxVGPRs;
   }
 
   /// Always satisified when the subtarget doesn't have a unified RF.
   inline bool satisfiesUnifiedTarget() const {
-    return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
+    return !MaxUnifiedVGPRs ||
+           RP.getVGPRNum(true, AddressableNumArchVGPRs) <= MaxUnifiedVGPRs;
   }
 
   inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
@@ -517,7 +571,8 @@ bool isEqual(const GCNRPTracker::LiveRegSet &S1,
              const GCNRPTracker::LiveRegSet &S2);
 
 Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
-                unsigned DynamicVGPRBlockSize = 0);
+                unsigned DynamicVGPRBlockSize = 0,
+                const MachineFunction *MF = nullptr);
 
 Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
                 const MachineRegisterInfo &MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ce1ce687d0038..3cf9a7c0f972e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,10 +190,14 @@ static void getRegisterPressures(
     TempUpwardTracker.recede(*MI);
     NewPressure = TempUpwardTracker.getPressure();
   }
+  unsigned ArchVGPRThreshold = DAG->MF.getSubtarget<GCNSubtarget>()
+                                   .getMaxNumVectorRegs(DAG->MF.getFunction())
+                                   .first;
   Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
   Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
-      NewPressure.getArchVGPRNum();
-  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
+      NewPressure.getArchVGPRNum(ArchVGPRThreshold);
+  Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+      NewPressure.getAGPRNum(ArchVGPRThreshold);
 }
 
 void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -339,7 +343,10 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                             ? static_cast<GCNRPTracker *>(&UpwardTracker)
                             : static_cast<GCNRPTracker *>(&DownwardTracker);
       SGPRPressure = T->getPressure().getSGPRNum();
-      VGPRPressure = T->getPressure().getArchVGPRNum();
+      VGPRPressure = T->getPressure().getArchVGPRNum(
+          DAG->MF.getSubtarget<GCNSubtarget>()
+              .getMaxNumVectorRegs(DAG->MF.getFunction())
+              .first);
     }
   }
   ReadyQueue &Q = Zone.Available;
@@ -1140,8 +1147,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
   if (DAG.MinOccupancy > InitialOccupancy) {
     for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
       DAG.RegionsWithMinOcc[IDX] =
-          DAG.Pressure[IDX].getOccupancy(
-              DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
+          DAG.Pressure[IDX].getOccupancy(DAG.MF) == DAG.MinOccupancy;
 
     LLVM_DEBUG(dbgs() << StageID
                       << " stage successfully increased occupancy to "
@@ -1193,8 +1199,10 @@ bool GCNSchedStage::initGCNRegion() {
       dbgs() << "Pressure before scheduling:\nRegion live-ins:"
              << print(DAG.LiveIns[RegionIdx], DAG.MRI)
              << "Region live-in pressure:  "
-             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
-             << "Region register pressure: " << print(PressureBefore));
+             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]),
+                      &ST, 0, &MF)
+             << "Region register pressure: "
+             << print(PressureBefore, &ST, 0, &MF));
 
   S.HasHighPressure = false;
   S.KnownExcessRP = isRegionWithExcessRP();
@@ -1275,17 +1283,17 @@ void GCNSchedStage::checkScheduling() {
   // Check the results of scheduling.
   PressureAfter = DAG.getRealRegPressure(RegionIdx);
 
-  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+  LLVM_DEBUG(dbgs() << "Pressure after scheduling: "
+                    << print(PressureAfter, &ST, 0, &MF));
   LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
 
-  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
-
+  unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
   if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
-      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+      PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
+          S.VGPRCriticalLimit) {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
-        DAG.MinOccupancy;
+        PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
 
     // Early out if we have achieved the occupancy target.
     LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -1294,10 +1302,10 @@ void GCNSchedStage::checkScheduling() {
 
   unsigned TargetOccupancy = std::min(
       S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
-  unsigned WavesAfter = std::min(
-      TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
-  unsigned WavesBefore = std::min(
-      TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
+  unsigned WavesAfter =
+      std::min(TargetOccupancy, PressureAfter.getOccupancy(DAG.MF));
+  unsigned WavesBefore =
+      std::min(TargetOccupancy, PressureBefore.getOccupancy(DAG.MF));
   LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                     << ", after " << WavesAfter << ".\n");
 
@@ -1331,9 +1339,10 @@ void GCNSchedStage::checkScheduling() {
   unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
   unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
 
-  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
-      PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
-      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
+  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) >
+          MaxVGPRs ||
+      PressureAfter.getArchVGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
+      PressureAfter.getAGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
       PressureAfter.getSGPRNum() > MaxSGPRs) {
     DAG.RegionsWithHighRP[RegionIdx] = true;
     DAG.RegionsWithExcessRP[RegionIdx] = true;
@@ -1346,8 +1355,7 @@ void GCNSchedStage::checkScheduling() {
   } else {
     DAG.Pressure[RegionIdx] = PressureAfter;
     DAG.RegionsWithMinOcc[RegionIdx] =
-        PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
-        DAG.MinOccupancy;
+        PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
   }
 }
 
@@ -1471,12 +1479,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
 
   // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
   if (DAG.MFI.isDynamicVGPREnabled()) {
+    unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
     unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureBefore.getVGPRNum(false));
+        PressureBefore.getVGPRNum(false, ArchVGPRThreshold));
     unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
         &ST, DAG.MFI.getDynamicVGPRBlockSize(),
-        PressureAfter.getVGPRNum(false));
+        PressureAfter.getVGPRNum(false, ArchVGPRThreshold));
     if (BlocksAfter > BlocksBefore)
       return true;
   }
@@ -1500,8 +1509,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
 bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   // If RP is not reduced in the unclustered reschedule stage, revert to the
   // old schedule.
-  if ((WavesAfter <=
-           PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
+  if ((WavesAfter <= PressureBefore.getOccupancy(DAG.MF) &&
        mayCauseSpilling(WavesAfter)) ||
       GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
     LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@@ -1523,9 +1531,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
   ScheduleMetrics MAfter = getScheduleMetrics(DAG);
   unsigned OldMetric = MBefore.getMetric();
   unsigned NewMetric = MAfter.getMetric();
-  unsigned WavesBefore = std::min(
-      S.getTargetOccupancy(),
-      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
+  unsigned WavesBefore =
+      std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(DAG.MF));
   unsigned Profit =
       ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
        ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@@ -1579,8 +1586,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
 
 void GCNSchedStage::revertScheduling() {
   DAG.RegionsWithMinOcc[RegionIdx] =
-      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
-      DAG.MinOccupancy;
+      PressureBefore.getOccupancy(DAG.MF) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
@@ -2017,9 +2023,7 @@ void PreRARematStage::rematerialize() {
       }
     }
     DAG.Pressure[I] = RP;
-    AchievedOcc = std::min(
-        AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
-                                             ->getDynamicVGPRBlockSize()));
+    AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(DAG.MF));
   }
   REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 6b13b06590102..e29ac72c7ba31 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -197,9 +197,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // pointer becomes dead and could otherwise be reused for destination.
   RPT.advanceToNext();
   GCNRegPressure MaxPressure = RPT.moveMaxPressure();
-  unsigned Occupancy = MaxPressure.getOccupancy(
-      *ST,
-      MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
+  unsigned Occupancy = MaxPressure.getOccupancy(*MI.getMF());
 
   // Don't push over half the register budget. We don't want to introduce
   // spilling just to form a soft clause.
@@ -211,7 +209,10 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
   // tracking does not account for the alignment requirements for SGPRs, or the
   // fragmentation of registers the allocator will need to satisfy.
   if (Occupancy >= MFI->getMinAllowedOccupancy() &&
-      MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
+      MaxPressure.getVGPRNum(
+          ST->hasGFX90AInsts(),
+          ST->getMaxNumVectorRegs(MI.getMF()->getFunction()).first) <=
+          MaxVGPRs / 2 &&
       MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
     LastRecordedOccupancy = Occupancy;
     return true;
diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
new file mode 100644
index 0000000000000..a5183ce0d2661
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
@@ -0,0 +1,481 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950  -run-pass=machine-scheduler --debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+--- |
+  define void @avgpr_rp_occ1() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ2() #1 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ3() #2 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ4() #3 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ5() #4 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ6() #5 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ7() #6 {
+  entry:
+    unreachable
+  }
+
+  define void @avgpr_rp_occ8() #7 {
+  entry:
+    unreachable
+  }
+
+
+  define void @vgpr_rp_occ1() #0 {
+  entry:
+    unreachable
+  }
+
+  define void @vgpr_rp_occ2() #1 {
+  entry:
+    unreachable
+  }
+
+  define void @vgpr_rp_occ3() #2 {
+  entry:
+    unreachable
+  }
+
+  attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #3 = {"amdgpu-waves-per-eu"="4,4" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #4 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #5 = {"amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #6 = {"amdgpu-waves-per-eu"="7,7" "amdgpu-flat-work-group-size"="64,64"}
+  attributes #7 = {"amdgpu-waves-per-eu"="8,8" "amdgpu-flat-work-group-size"="64,64"}
+
+
+...
+
+# CHECK: avgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 256 AGPRs: 192(O1), SGPRs: 0(O10), LVGPR WT: 256, LSGPR WT: 0 -> Occ: 1
+
+---
+name:            avgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:av_1024 = IMPLICIT_DEF
+    %9:av_1024 = IMPLICIT_DEF
+    %10:av_1024 = IMPLICIT_DEF
+    %11:av_1024 = IMPLICIT_DEF
+    %12:av_1024 = IMPLICIT_DEF
+    %13:av_1024 = IMPLICIT_DEF
+    %14:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7
+
+  bb.1:
+    KILL %8, %9, %10, %11, %12, %13, %14
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 64(O2), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 2
+
+---
+name:            avgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    %5:av_1024 = IMPLICIT_DEF
+    %6:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3
+
+  bb.1:
+    KILL %4, %5, %6
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 84 AGPRs: 44(O4), SGPRs: 0(O10), LVGPR WT: 84, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2
+
+  bb.1:
+    KILL %3, %4
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ4:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 64 AGPRs: 64(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ4
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ5:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 80, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ5
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ6:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 88, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ6
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ7:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 92, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ7
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ8:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 96, LSGPR WT: 0 -> Occ: 4
+
+---
+name:            avgpr_rp_occ8
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:av_1024 = IMPLICIT_DEF
+    %2:av_1024 = IMPLICIT_DEF
+    %3:av_1024 = IMPLICIT_DEF
+    %4:av_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2 
+
+  bb.1:
+    KILL %3, %4 
+    S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 448 AGPRs: 0(O1), SGPRs: 0(O10), LVGPR WT: 448, LSGPR WT: 0 -> Occ: 1
+
+---
+name:            vgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    %7:vreg_1024 = IMPLICIT_DEF
+    %8:vreg_1024 = IMPLICIT_DEF
+    %9:vreg_1024 = IMPLICIT_DEF
+    %10:vreg_1024 = IMPLICIT_DEF
+    %11:vreg_1024 = IMPLICIT_DEF
+    %12:vreg_1024 = IMPLICIT_DEF
+    %13:vreg_1024 = IMPLICIT_DEF
+    %14:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3, %4, %5, %6, %7
+
+  bb.1:
+    KILL %8, %9, %10, %11, %12, %13, %14
+    S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 192 AGPRs: 0(O2), SGPRs: 0(O10), LVGPR WT: 192, LSGPR WT: 0 -> Occ: 2
+
+---
+name:            vgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    %5:vreg_1024 = IMPLICIT_DEF
+    %6:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2, %3
+
+  bb.1:
+    KILL %4, %5, %6
+    S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure:  VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 0(O4), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 4
+
+
+---
+name:            vgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr96_sgpr97_sgpr98_sgpr99'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+    workGroupIDX:    { reg: '$sgpr6' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+    workItemIDX:     { reg: '$vgpr0' }
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+   liveins: $vgpr0, $sgpr4_sgpr5
+    %1:vreg_1024 = IMPLICIT_DEF
+    %2:vreg_1024 = IMPLICIT_DEF
+    %3:vreg_1024 = IMPLICIT_DEF
+    %4:vreg_1024 = IMPLICIT_DEF
+    SCHED_BARRIER 0
+    KILL %1, %2
+
+  bb.1:
+    KILL %3, %4
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
index 2a08c52e447ba..72181346764fb 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
@@ -6,7 +6,7 @@
 # CHECK-NEXT: test_get_liveins:%bb.0
 # CHECK: ********** MI Scheduling **********
 # CHECK-NEXT: test_get_liveins:%bb.1
-# CHECK: Region live-in pressure:  VGPRs: 1 AGPRs: 0, SGPRs: 0, LVGPR WT: 0, LSGPR WT: 0
+# CHECK: Region live-in pressure:  VGPRs: 1 AGPRs: 0(O10), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 10
 # CHECK: ScheduleDAGMILive::schedule starting
 
 ---

>From 13f5604a8e6bbe1576457fea43ac40f4ce954f49 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 15 Jul 2025 15:10:41 -0700
Subject: [PATCH 2/5] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
 to AGPR

Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 283 ++++++++++++++++++++
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  35 ++-
 2 files changed, 313 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 3cf9a7c0f972e..de4f3433c80b2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -29,6 +29,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/MC/LaneBitmask.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -535,6 +536,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C, bool IsLegacyScheduler)
     : GCNSchedStrategy(C) {
   SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
   SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
   SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
   SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -785,6 +787,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
   switch (SchedStageID) {
   case GCNSchedStageID::OccInitialSchedule:
     return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::RewriteSchedule:
+    return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
   case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -948,10 +952,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
   Pressure.resize(Regions.size());
   RegionsWithHighRP.resize(Regions.size());
   RegionsWithExcessRP.resize(Regions.size());
+  RegionsWithExcessArchVGPR.resize(Regions.size());
   RegionsWithMinOcc.resize(Regions.size());
   RegionsWithIGLPInstrs.resize(Regions.size());
   RegionsWithHighRP.reset();
   RegionsWithExcessRP.reset();
+  RegionsWithExcessArchVGPR.reset();
   RegionsWithMinOcc.reset();
   RegionsWithIGLPInstrs.reset();
 
@@ -1010,6 +1016,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::OccInitialSchedule:
     OS << "Max Occupancy Initial Schedule";
     break;
+  case GCNSchedStageID::RewriteSchedule:
+    OS << "Instruction Rewriting Reschedule";
+    break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
     break;
@@ -1043,6 +1052,245 @@ bool GCNSchedStage::initGCNSchedStage() {
   return true;
 }
 
+bool RewriteScheduleStage::initGCNSchedStage() {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none())
+    return false;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *SRI = ST.getRegisterInfo();
+  SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies;
+  SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies;
+  std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts;
+
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (TII->isMAI(MI)) {
+        int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+        if (ReplacementOp == -1)
+          continue;
+        const TargetRegisterClass *VGPRRC =
+            DAG.MRI.getRegClass(MI.getOperand(0).getReg());
+        const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+        const TargetRegisterClass *DestConstrainExceptRC =
+            recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC,
+                                              AGPRRC);
+
+        if (!DestConstrainExceptRC)
+          CrossRCUseCopies.insert(&MI);
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (Src2 && Src2->isReg()) {
+          const TargetRegisterClass *Src2ConstrainExceptRC =
+              recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC);
+          if (Src2ConstrainExceptRC != AGPRRC) // nullptr != AGPRRC too
+            CrossRCDefCopies.insert(&MI);
+
+          DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+        }
+
+        DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC);
+
+        auto OriginalOpc = MI.getOpcode();
+        MI.setDesc(TII->get(ReplacementOp));
+        RewriteInsts.push_back({&MI, OriginalOpc});
+      }
+    }
+  }
+
+  bool ShouldRewrite = false;
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
+    if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
+      continue;
+
+    // For the cases we care about (i.e. ArchVGPR usage is greater than the
+    // addressable limit), rewriting alone should bring pressure to manageable
+    // level. If we find any such region, then the rewrite is potentially
+    // beneficial.
+    auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
+    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+    if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
+        PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
+      ShouldRewrite = true;
+      break;
+    }
+  }
+
+  // If we find that we'll need to insert cross RC copies inside loop bodies,
+  // then bail
+  if (ShouldRewrite) {
+    CI.clear();
+    CI.compute(MF);
+
+    for (auto *DefMI : CrossRCUseCopies) {
+      auto DefReg = DefMI->getOperand(0).getReg();
+
+      for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) {
+        for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) {
+          auto &TheOp = UseMI.getOperand(OpNo);
+          if (!TheOp.isReg() || !TheOp.isUse())
+            continue;
+          if (TheOp.getReg() != DefReg)
+            continue;
+
+          auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+          if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+            continue;
+
+          unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
+          if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
+            ShouldRewrite = false;
+            break;
+          }
+        }
+        if (!ShouldRewrite)
+          break;
+      }
+      if (!ShouldRewrite)
+        break;
+    }
+  }
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form which
+  // may result in less cross RC copies.
+  if (!ShouldRewrite) {
+    for (auto RI : RewriteInsts) {
+      MachineInstr *MI = RI.first;
+
+      assert(TII->isMAI(*MI));
+      const TargetRegisterClass *AGPRRC =
+          DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+      MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+      assert(Src2);
+
+      if (Src2->isReg()) {
+        DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+      }
+      DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+      MI->setDesc(TII->get(RI.second));
+    }
+
+    return false;
+  }
+
+  DAG.RegionsWithExcessArchVGPR.reset();
+  DAG.RegionsWithExcessRP.reset();
+
+  // Insert cross RC copies for the users of the MFMA result
+  for (auto *MI : CrossRCUseCopies) {
+    auto DefReg = MI->getOperand(0).getReg();
+    SmallVector<MachineInstr *, 4> UseInstrs;
+    for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
+      UseInstrs.push_back(&UseMI);
+    // FIXME: verify the single copy dominates every rewritten user below.
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto *UseMI : UseInstrs) {
+      for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
+        auto &TheOp = UseMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isUse())
+          continue;
+        if (TheOp.getReg() != DefReg)
+          continue;
+
+        auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register DestVGPR;
+        if (!NewCopies.contains(DefReg)) {
+          DestVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
+
+          // Insert copy near the user to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+                      UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(DestVGPR, 0, 0)
+                  .addUse(DefReg, 0, 0);
+
+          NewCopies[DefReg] = VGPRCopy;
+        }
+        DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+        TheOp.setReg(DestVGPR);
+      }
+    }
+    if (NewCopies.contains(DefReg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
+      DAG.LIS->removeInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[DefReg]->getOperand(0).getReg());
+    }
+  }
+
+  // Insert cross RC copies for the use operands of the MFMA
+  for (auto *MI : CrossRCDefCopies) {
+    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+    if (!Src2)
+      continue;
+    if (!Src2->isReg())
+      continue;
+    auto Src2Reg = Src2->getReg();
+    SmallVector<MachineInstr *, 4> DefInstrs;
+    for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
+      DefInstrs.push_back(&DefMI);
+
+    DenseMap<Register, MachineInstr *> NewCopies;
+    for (auto *DefMI : DefInstrs) {
+      for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
+        auto &TheOp = DefMI->getOperand(OpNo);
+        if (!TheOp.isReg() || !TheOp.isDef())
+          continue;
+        if (TheOp.getReg() != Src2Reg)
+          continue;
+
+        auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+        if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+          continue;
+
+        Register SrcVGPR;
+        if (!NewCopies.contains(Src2Reg)) {
+          SrcVGPR = DAG.MRI.createVirtualRegister(
+              SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
+
+          // Insert copy near the def to avoid inserting inside loops.
+          MachineInstrBuilder VGPRCopy =
+              BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+                      DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+                  .addDef(Src2Reg, 0, 0)
+                  .addUse(SrcVGPR, 0, 0);
+
+          NewCopies[Src2Reg] = VGPRCopy;
+        }
+
+        SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+        TheOp.setReg(SrcVGPR);
+      }
+    }
+
+    if (NewCopies.contains(Src2Reg)) {
+      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
+      DAG.LIS->removeInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+      DAG.LIS->createAndComputeVirtRegInterval(
+          NewCopies[Src2Reg]->getOperand(1).getReg());
+    }
+  }
+
+  // Liveins may have been modified for cross RC copies
+  RegionPressureMap LiveInUpdater(&DAG, false);
+  LiveInUpdater.buildLiveRegMap();
+
+  for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++)
+    DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx);
+
+  return true;
+}
+
 bool UnclusteredHighRPStage::initGCNSchedStage() {
   if (DisableUnclusterHighRP)
     return false;
@@ -1348,6 +1596,9 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
+  if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+    DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
+
   // Revert if this region's schedule would cause a drop in occupancy or
   // spilling.
   if (shouldRevertScheduling(WavesAfter)) {
@@ -1648,6 +1899,38 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+
+  if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+    return false;
+  return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+const TargetRegisterClass *
+RewriteScheduleStage::recomputeRegClassExceptRewritable(
+    Register Reg, const TargetRegisterClass *OldRC,
+    const TargetRegisterClass *NewRC) const {
+
+  // Accumulate constraints from all uses.
+  for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) {
+    // Apply the effect of the given operand to NewRC.
+    MachineInstr *MI = MO.getParent();
+    // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+    // effects of rewrite candidates. It just so happens that we can use either
+    // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+    // effects of the individual operands.
+    if (isRewriteCandidate(MI))
+      continue;
+
+    unsigned OpNo = &MO - &MI->getOperand(0);
+    NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI);
+    if (!NewRC || NewRC == OldRC)
+      return nullptr;
+  }
+
+  return NewRC;
+}
+
 bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
                                          SlotIndex OriginalIdx,
                                          SlotIndex RematIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94cd795bbc8f6..af264d062ce5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,11 +28,12 @@ class GCNSchedStage;
 
 enum class GCNSchedStageID : unsigned {
   OccInitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  ILPInitialSchedule = 4,
-  MemoryClauseInitialSchedule = 5
+  RewriteSchedule = 1,
+  UnclusteredHighRPReschedule = 2,
+  ClusteredLowOccupancyReschedule = 3,
+  PreRARematerialize = 4,
+  ILPInitialSchedule = 5,
+  MemoryClauseInitialSchedule = 6
 };
 
 #ifndef NDEBUG
@@ -224,6 +225,7 @@ using RegionBoundaries =
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
   friend class OccInitialScheduleStage;
+  friend class RewriteScheduleStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
@@ -250,6 +252,11 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   // limit. Register pressure in these regions usually will result in spilling.
   BitVector RegionsWithExcessRP;
 
+  // Record regions with excess ArchVGPR register pressure over the physical
+  // register limit. Register pressure in these regions will usually result in
+  // spilling.
+  BitVector RegionsWithExcessArchVGPR;
+
   // Regions that has the same occupancy as the latest MinOccupancy
   BitVector RegionsWithMinOcc;
 
@@ -401,6 +408,24 @@ class OccInitialScheduleStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class RewriteScheduleStage : public GCNSchedStage {
+private:
+  const TargetRegisterClass *
+  recomputeRegClassExceptRewritable(Register Reg,
+                                    const TargetRegisterClass *OldRC,
+                                    const TargetRegisterClass *NewRC) const;
+
+  bool isRewriteCandidate(MachineInstr *MI) const;
+
+  MachineCycleInfo CI;
+
+public:
+  bool initGCNSchedStage() override;
+
+  RewriteScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 class UnclusteredHighRPStage : public GCNSchedStage {
 private:
   // Save the initial occupancy before starting this stage.

>From 678aa740ab750b44a950630db3883b3d50282fd0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 09:39:49 -0700
Subject: [PATCH 3/5] Rebase + block frequencies

Change-Id: I3ca5a96c10fad06bb9c66d6b4e36fbe48157070a
---
 llvm/lib/Target/AMDGPU/GCNRegPressure.h     |  1 -
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 85 +++++++++++++++++----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   |  6 +-
 3 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 8b80cc42c9bb0..81df54126cdfe 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -99,7 +99,6 @@ struct GCNRegPressure {
   unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
     unsigned AVGPRsAsVGPRs =
         getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR);
-
     return Value[VGPR] + AVGPRsAsVGPRs;
   }
   /// \returns the AccVGPR32 pressure
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index de4f3433c80b2..154024157056b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1098,29 +1098,80 @@ bool RewriteScheduleStage::initGCNSchedStage() {
     }
   }
 
-  bool ShouldRewrite = false;
+  unsigned ArchVGPRThreshold =
+      ST.getMaxNumVectorRegs(DAG.MF.getFunction()).first;
+
+  int64_t Cost = 0;
+      MBFI.calculate(MF, MBPI, *DAG.MLI);
   for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
     if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
       continue;
 
+    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+
+    auto PressureBefore = DAG.Pressure[RegionIdx];
+    unsigned UnifiedPressureBefore =
+        PressureBefore.getVGPRNum(true, ArchVGPRThreshold);
+    unsigned ArchPressureBefore =
+        PressureBefore.getArchVGPRNum(ArchVGPRThreshold);
+    unsigned AGPRPressureBefore = PressureBefore.getAGPRNum(ArchVGPRThreshold);
+    unsigned UnifiedSpillBefore =
+        UnifiedPressureBefore > MaxCombinedVGPRs
+            ? (UnifiedPressureBefore - MaxCombinedVGPRs)
+            : 0;
+    unsigned ArchSpillBefore =
+        ArchPressureBefore > ST.getAddressableNumArchVGPRs()
+            ? (ArchPressureBefore - ST.getAddressableNumArchVGPRs())
+            : 0;
+    unsigned AGPRSpillBefore =
+        AGPRPressureBefore > ST.getAddressableNumArchVGPRs()
+            ? (AGPRPressureBefore - ST.getAddressableNumArchVGPRs())
+            : 0;
+
+    unsigned SpillCostBefore =
+        std::max(UnifiedSpillBefore, (ArchSpillBefore + AGPRSpillBefore));
+
+   
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
     // beneficial.
     auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
-    unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
-    if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
-        PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
-      ShouldRewrite = true;
-      break;
-    }
+    unsigned UnifiedPressureAfter =
+        PressureAfter.getVGPRNum(true, ArchVGPRThreshold);
+    unsigned ArchPressureAfter =
+        PressureAfter.getArchVGPRNum(ArchVGPRThreshold);
+    unsigned AGPRPressureAfter = PressureAfter.getAGPRNum(ArchVGPRThreshold);
+    unsigned UnifiedSpillAfter = UnifiedPressureAfter > MaxCombinedVGPRs
+                                     ? (UnifiedPressureAfter - MaxCombinedVGPRs)
+                                     : 0;
+    unsigned ArchSpillAfter =
+        ArchPressureAfter > ST.getAddressableNumArchVGPRs()
+            ? (ArchPressureAfter - ST.getAddressableNumArchVGPRs())
+            : 0;
+    unsigned AGPRSpillAfter =
+        AGPRPressureAfter > ST.getAddressableNumArchVGPRs()
+            ? (AGPRPressureAfter - ST.getAddressableNumArchVGPRs())
+            : 0;
+
+    unsigned SpillCostAfter =
+        std::max(UnifiedSpillAfter, (ArchSpillAfter + AGPRSpillAfter));
+
+    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+    uint64_t BlockFreq =
+        EntryFreq ? MBFI.getBlockFreq(DAG.Regions[RegionIdx].first->getParent())
+                        .getFrequency() / EntryFreq
+                  : 1;
+
+    // Assumes perfect spilling -- giving edge to VGPR form.
+    Cost += ((int)SpillCostAfter - (int)SpillCostBefore) * (int)BlockFreq;
   }
 
   // If we find that we'll need to insert cross RC copies inside loop bodies,
   // then bail
+  bool ShouldRewrite = Cost < 0;
   if (ShouldRewrite) {
-    CI.clear();
-    CI.compute(MF);
+    uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
 
     for (auto *DefMI : CrossRCUseCopies) {
       auto DefReg = DefMI->getOperand(0).getReg();
@@ -1137,11 +1188,16 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
             continue;
 
-          unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
-          if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
-            ShouldRewrite = false;
+          uint64_t UseFreq =
+              EntryFreq ? MBFI.getBlockFreq(UseMI.getParent()).getFrequency() /
+                              EntryFreq
+                        : 1;
+
+          // Assumes no copy-reuse, giving edge to VGPR form.
+          Cost += UseFreq;
+          ShouldRewrite = Cost < 0;
+          if (!ShouldRewrite)
             break;
-          }
         }
         if (!ShouldRewrite)
           break;
@@ -1596,7 +1652,8 @@ void GCNSchedStage::checkScheduling() {
     DAG.RegionsWithExcessRP[RegionIdx] = true;
   }
 
-  if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+  if (PressureAfter.getArchVGPRNum(ArchVGPRThreshold) >
+      ST.getAddressableNumArchVGPRs())
     DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
 
   // Revert if this region's schedule would cause a drop in occupancy or
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index af264d062ce5a..786b5264bcec2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,9 @@
 #include "GCNRegPressure.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 
@@ -417,7 +420,8 @@ class RewriteScheduleStage : public GCNSchedStage {
 
   bool isRewriteCandidate(MachineInstr *MI) const;
 
-  MachineCycleInfo CI;
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI;
 
 public:
   bool initGCNSchedStage() override;

>From 639642e6e809729a23006312c2fe1f7ad389b214 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 15:06:32 -0700
Subject: [PATCH 4/5] Track copies by MBB

Change-Id: Icd3934749a199e29660bfb0d187f045b18f1bd7d
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 45 ++++++++++++---------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 154024157056b..24e5383fc5963 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1241,7 +1241,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
     for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
       UseInstrs.push_back(&UseMI);
 
-    DenseMap<Register, MachineInstr *> NewCopies;
+    DenseMap<Register, DenseMap<MachineBasicBlock *, MachineInstr *>> NewCopies;
     for (auto UseMI : UseInstrs) {
       for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
         auto &TheOp = UseMI->getOperand(OpNo);
@@ -1256,29 +1256,32 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           continue;
 
         Register DestVGPR;
-        if (!NewCopies.contains(DefReg)) {
+        if (!NewCopies.contains(DefReg) || !NewCopies[DefReg].contains(UseMI->getParent())) {
           Register DestVGPR = DAG.MRI.createVirtualRegister(
               SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
 
           // Insert copy near the user to avoid inserting inside loops.
+          // TODO: choose a better insertion point for the cross-RC copy.
           MachineInstrBuilder VGPRCopy =
-              BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+              BuildMI(*UseMI->getParent(), UseMI->getParent()->getFirstNonPHI(),
                       UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
                   .addDef(DestVGPR, 0, 0)
                   .addUse(DefReg, 0, 0);
 
-          NewCopies[DefReg] = VGPRCopy;
+          NewCopies[DefReg][UseMI->getParent()] = VGPRCopy;
         }
-        DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+        DestVGPR = NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
         TheOp.setReg(DestVGPR);
       }
     }
     if (NewCopies.contains(DefReg)) {
-      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
-      DAG.LIS->removeInterval(DefReg);
-      DAG.LIS->createAndComputeVirtRegInterval(DefReg);
-      DAG.LIS->createAndComputeVirtRegInterval(
-          NewCopies[DefReg]->getOperand(0).getReg());
+      for (auto NewCopy : NewCopies[DefReg]) {
+        DAG.LIS->InsertMachineInstrInMaps(*NewCopy.second);
+        DAG.LIS->removeInterval(DefReg);
+        DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+        DAG.LIS->createAndComputeVirtRegInterval(
+        NewCopy.second->getOperand(0).getReg());
+      }
     }
   }
 
@@ -1294,7 +1297,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
     for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
       DefInstrs.push_back(&DefMI);
 
-    DenseMap<Register, MachineInstr *> NewCopies;
+    DenseMap<Register, DenseMap<MachineBasicBlock *, MachineInstr *>> NewCopies;
     for (auto DefMI : DefInstrs) {
       for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
         auto &TheOp = DefMI->getOperand(OpNo);
@@ -1309,31 +1312,33 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           continue;
 
         Register SrcVGPR;
-        if (!NewCopies.contains(Src2Reg)) {
+        if (!NewCopies.contains(Src2Reg) || !NewCopies[Src2Reg].contains(DefMI->getParent())) {
           Register SrcVGPR = DAG.MRI.createVirtualRegister(
               SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
 
           // Insert copy near the def to avoid inserting inside loops.
           MachineInstrBuilder VGPRCopy =
-              BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+              BuildMI(*DefMI->getParent(), DefMI->getParent()->end(),
                       DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
                   .addDef(Src2Reg, 0, 0)
                   .addUse(SrcVGPR, 0, 0);
 
-          NewCopies[Src2Reg] = VGPRCopy;
+          NewCopies[Src2Reg][DefMI->getParent()] = VGPRCopy;
         }
 
-        SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+        SrcVGPR = NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
         TheOp.setReg(SrcVGPR);
       }
     }
 
     if (NewCopies.contains(Src2Reg)) {
-      DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
-      DAG.LIS->removeInterval(Src2Reg);
-      DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
-      DAG.LIS->createAndComputeVirtRegInterval(
-          NewCopies[Src2Reg]->getOperand(1).getReg());
+      for (auto NewCopy : NewCopies[Src2Reg]) {
+        DAG.LIS->InsertMachineInstrInMaps(*NewCopy.second);
+        DAG.LIS->removeInterval(Src2Reg);
+        DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+        DAG.LIS->createAndComputeVirtRegInterval(
+            NewCopy.second->getOperand(1).getReg());
+      }
     }
   }
 

>From c62a2f127cba5d6df350474dfd4a6e5f9250fe4f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 15:47:12 -0700
Subject: [PATCH 5/5] Formatting

Change-Id: Iab30e9e090220757e79f0c6b2c898465c1262abf
---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 24e5383fc5963..f6b9931ddaef5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1102,7 +1102,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
       ST.getMaxNumVectorRegs(DAG.MF.getFunction()).first;
 
   int64_t Cost = 0;
-      MBFI.calculate(MF, MBPI, *DAG.MLI);
+  MBFI.calculate(MF, MBPI, *DAG.MLI);
   for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
     if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
       continue;
@@ -1131,7 +1131,6 @@ bool RewriteScheduleStage::initGCNSchedStage() {
     unsigned SpillCostBefore =
         std::max(UnifiedSpillBefore, (ArchSpillBefore + AGPRSpillBefore));
 
-   
     // For the cases we care about (i.e. ArchVGPR usage is greater than the
     // addressable limit), rewriting alone should bring pressure to manageable
     // level. If we find any such region, then the rewrite is potentially
@@ -1160,7 +1159,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
     uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
     uint64_t BlockFreq =
         EntryFreq ? MBFI.getBlockFreq(DAG.Regions[RegionIdx].first->getParent())
-                        .getFrequency() / EntryFreq
+                            .getFrequency() /
+                        EntryFreq
                   : 1;
 
     // Assumes perfect spilling -- giving edge to VGPR form.
@@ -1256,7 +1256,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           continue;
 
         Register DestVGPR;
-        if (!NewCopies.contains(DefReg) || !NewCopies[DefReg].contains(UseMI->getParent())) {
+        if (!NewCopies.contains(DefReg) ||
+            !NewCopies[DefReg].contains(UseMI->getParent())) {
           Register DestVGPR = DAG.MRI.createVirtualRegister(
               SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
 
@@ -1270,7 +1271,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
 
           NewCopies[DefReg][UseMI->getParent()] = VGPRCopy;
         }
-        DestVGPR = NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
+        DestVGPR =
+            NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
         TheOp.setReg(DestVGPR);
       }
     }
@@ -1280,7 +1282,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
         DAG.LIS->removeInterval(DefReg);
         DAG.LIS->createAndComputeVirtRegInterval(DefReg);
         DAG.LIS->createAndComputeVirtRegInterval(
-        NewCopy.second->getOperand(0).getReg());
+            NewCopy.second->getOperand(0).getReg());
       }
     }
   }
@@ -1312,7 +1314,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           continue;
 
         Register SrcVGPR;
-        if (!NewCopies.contains(Src2Reg) || !NewCopies[Src2Reg].contains(DefMI->getParent())) {
+        if (!NewCopies.contains(Src2Reg) ||
+            !NewCopies[Src2Reg].contains(DefMI->getParent())) {
           Register SrcVGPR = DAG.MRI.createVirtualRegister(
               SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
 
@@ -1326,7 +1329,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
           NewCopies[Src2Reg][DefMI->getParent()] = VGPRCopy;
         }
 
-        SrcVGPR = NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
+        SrcVGPR =
+            NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
         TheOp.setReg(SrcVGPR);
       }
     }



More information about the llvm-commits mailing list