[llvm] [AMDGPU][Scheduler] Refactor VGPR rematerialization during scheduling (PR #118722)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 11 13:49:22 PST 2024


https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/118722

>From e5abab8b2143a4813359097fded2e1965ebaf591 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 15:49:33 +0100
Subject: [PATCH 01/10] Working refactoring of simple VGPR remat.

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   | 404 +++++++++---------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  56 ++-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |   4 +
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |   5 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  13 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   6 +
 .../machine-scheduler-sink-trivial-remats.mir | 145 ++++++-
 7 files changed, 418 insertions(+), 215 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..1b39a5a7db7192 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,8 +25,13 @@
 
 #include "GCNSchedStrategy.h"
 #include "AMDGPUIGroupLP.h"
+#include "GCNRegPressure.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
 
 #define DEBUG_TYPE "machine-scheduler"
 
@@ -945,20 +950,19 @@ bool PreRARematStage::initGCNSchedStage() {
     return false;
 
   const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
-  // Check maximum occupancy
+  // Rematerialization will not help if occupancy is LDS-limited.
   if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
       DAG.MinOccupancy)
     return false;
 
   // FIXME: This pass will invalidate cached MBBLiveIns for regions
-  // inbetween the defs and region we sinked the def to. Cached pressure
-  // for regions where a def is sinked from will also be invalidated. Will
-  // need to be fixed if there is another pass after this pass.
+  // in between the defs and the region we sank the def to. Will need to be fixed
+  // if there is another pass after this pass.
   assert(!S.hasNextStage());
 
-  collectRematerializableInstructions();
-  if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+  if (!collectRematerializableInstructions())
     return false;
+  sinkTriviallyRematInsts(ST, TII);
 
   LLVM_DEBUG(
       dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1467,231 +1471,249 @@ void GCNSchedStage::revertScheduling() {
   DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
 }
 
-void PreRARematStage::collectRematerializableInstructions() {
+/// Allows easy filtering of this stage's debug output.
+#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+
+bool PreRARematStage::collectRematerializableInstructions() {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
-  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
-    Register Reg = Register::index2VirtReg(I);
-    if (!DAG.LIS->hasInterval(Reg))
-      continue;
 
-    // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-      continue;
+  RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
 
-    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
-    MachineInstr *Def = Op->getParent();
-    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
+  // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+  // occupancy) to the number of VGPRs that must be deducted from their maximum
+  // VGPR pressure for their occupancy to be increased by one.
+  DenseMap<unsigned, unsigned> OptRegions;
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    if (!DAG.RegionsWithMinOcc[I])
       continue;
+    GCNRegPressure &RP = DAG.Pressure[I];
 
-    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-    if (Def->getParent() == UseI->getParent())
+    // We do not rematerialize SGPR-defining instructions yet, so do not bother
+    // optimizing regions whose occupancy is SGPR-limited.
+    if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy)
       continue;
 
-    // We are only collecting defs that are defined in another block and are
-    // live-through or used inside regions at MinOccupancy. This means that the
-    // register must be in the live-in set for the region.
-    bool AddedToRematList = false;
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      auto It = DAG.LiveIns[I].find(Reg);
-      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
-        if (DAG.RegionsWithMinOcc[I]) {
-          RematerializableInsts[I][Def] = UseI;
-          AddedToRematList = true;
-        }
-
-        // Collect regions with rematerializable reg as live-in to avoid
-        // searching later when updating RP.
-        RematDefToLiveInRegions[Def].push_back(I);
-      }
-    }
-    if (!AddedToRematList)
-      RematDefToLiveInRegions.erase(Def);
+    unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
+    unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
+    OptRegions.insert({I, NumToIncreaseOcc});
+    RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: decrease by "
+                    << NumToIncreaseOcc << " VGPR(s) to improve occupancy\n");
   }
-}
-
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
-  // Temporary copies of cached variables we will be modifying and replacing if
-  // sinking succeeds.
-  SmallVector<
-      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
-      NewRegions;
-  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
-  DenseMap<unsigned, GCNRegPressure> NewPressure;
-  BitVector NewRescheduleRegions;
-  LiveIntervals *LIS = DAG.LIS;
-
-  NewRegions.resize(DAG.Regions.size());
-  NewRescheduleRegions.resize(DAG.Regions.size());
-
-  // Collect only regions that has a rematerializable def as a live-in.
-  SmallSet<unsigned, 16> ImpactedRegions;
-  for (const auto &It : RematDefToLiveInRegions)
-    ImpactedRegions.insert(It.second.begin(), It.second.end());
-
-  // Make copies of register pressure and live-ins cache that will be updated
-  // as we rematerialize.
-  for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = DAG.Pressure[Idx];
-    NewLiveIns[Idx] = DAG.LiveIns[Idx];
-  }
-  NewRegions = DAG.Regions;
-  NewRescheduleRegions.reset();
+  if (OptRegions.empty())
+    return false;
 
-  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
-  bool Improved = false;
-  for (auto I : ImpactedRegions) {
-    if (!DAG.RegionsWithMinOcc[I])
-      continue;
+  // Tracks the estimated RP reduction that rematerializing the current
+  // candidate instruction would yield in each optimizable region.
+  auto ReduceRPInRegion = [&](auto OptIt, unsigned I,
+                              LaneBitmask Mask) -> bool {
+    auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    unsigned &RPExcess = OptIt->getSecond();
+    if (NumRegs >= RPExcess) {
+      OptRegions.erase(I);
+      LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+                      << "\n");
+    } else {
+      RPExcess -= NumRegs;
+      LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+                      << " by " << NumRegs << " (" << RPExcess << " left)\n");
+    }
+    return OptRegions.empty();
+  };
+
+  // We need up-to-date live-out info to query live-out register masks in
+  // regions containing rematerializable instructions.
+  DAG.RegionLiveOuts.buildLiveRegMap();
+
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // We only support instructions with at least one register (but
+      // non-subregister) operand.
+      MachineInstr &DefMI = *MI;
+      if (DefMI.isBundle() || !DefMI.getNumOperands() ||
+          !DefMI.getOperand(0).isReg() || DefMI.getOperand(0).getSubReg())
+        continue;
 
-    Improved = false;
-    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
-    int SGPRUsage = NewPressure[I].getSGPRNum();
+      // We only support rematerializing virtual VGPRs with one definition and
+      // one use.
+      Register Reg = DefMI.getOperand(0).getReg();
+      if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+          !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+          !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
+        continue;
 
-    // TODO: Handle occupancy drop due to AGPR and SGPR.
-    // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
-      break;
+      // The instruction must be trivially rematerializable and have no virtual
+      // register use.
+      if (!isTriviallyReMaterializable(DefMI))
+        continue;
 
-    // The occupancy of this region could have been improved by a previous
-    // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
-      NewRescheduleRegions[I] = true;
-      Improved = true;
-      continue;
-    }
+      // We only care to rematerialize the instruction if its single use is in a
+      // different block.
+      MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
+      if (DefMI.getParent() == UseMI->getParent())
+        continue;
 
-    // First check if we have enough trivially rematerializable instructions to
-    // improve occupancy. Optimistically assume all instructions we are able to
-    // sink decreased RP.
-    int TotalSinkableRegs = 0;
-    for (const auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      Register DefReg = Def->getOperand(0).getReg();
-      TotalSinkableRegs +=
-          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
-    }
-    int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
-    unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
-    // If in the most optimistic scenario, we cannot improve occupancy, then do
-    // not attempt to sink any instructions.
-    if (OptimisticOccupancy <= DAG.MinOccupancy)
-      break;
+      RA_DEBUG(dbgs() << "In region " << I << ", instruction " << DefMI
+                      << " is rematerializable with single use " << *UseMI);
+      auto &Remat = RematInstructions.emplace_back(&DefMI, I, UseMI);
+
+      bool RematUseful = false;
+      if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+        // Optimistically consider that moving the instruction out of its
+        // defining region will reduce RP in the latter; this assumes that
+        // maximum RP in the region is reached somewhere between the defining
+        // instruction and the end of the region.
+        RA_DEBUG(dbgs() << "  Instruction's defining region is optimizable: ");
+        RematUseful = true;
+        auto RegMask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+        if (ReduceRPInRegion(It, I, RegMask))
+          return true;
+      }
 
-    unsigned ImproveOccupancy = 0;
-    SmallVector<MachineInstr *, 4> SinkedDefs;
-    for (auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      MachineBasicBlock::iterator InsertPos =
-          MachineBasicBlock::iterator(It.second);
-      Register Reg = Def->getOperand(0).getReg();
-      // Rematerialize MI to its use block. Since we are only rematerializing
-      // instructions that do not have any virtual reg uses, we do not need to
-      // call LiveRangeEdit::allUsesAvailableAt() and
-      // LiveRangeEdit::canRematerializeAt().
-      TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
-                         Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
-      MachineInstr *NewMI = &*std::prev(InsertPos);
-      LIS->InsertMachineInstrInMaps(*NewMI);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
-      InsertedMIToOldDef[NewMI] = Def;
-
-      // Update region boundaries in scheduling region we sinked from since we
-      // may sink an instruction that was at the beginning or end of its region
-      DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
-                                 /*Removing =*/true);
-
-      // Update region boundaries in region we sinked to.
-      DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
-      LaneBitmask PrevMask = NewLiveIns[I][Reg];
-      // FIXME: Also update cached pressure for where the def was sinked from.
-      // Update RP for all regions that has this reg as a live-in and remove
-      // the reg from all regions as a live-in.
-      for (auto Idx : RematDefToLiveInRegions[Def]) {
-        NewLiveIns[Idx].erase(Reg);
-        if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
-          // Def is live-through and not used in this block.
-          NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+      for (unsigned LVRegion = 0; LVRegion != E; ++LVRegion) {
+        // We are only collecting regions in which the register is a live-in
+        // (and may be live-through).
+        auto It = DAG.LiveIns[LVRegion].find(Reg);
+        if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
+          continue;
+        Remat.LiveInRegions.insert(LVRegion);
+        RA_DEBUG(dbgs() << "  Def is live-in in region " << LVRegion
+                        << ": ");
+
+        // Account for the reduction in RP due to the rematerialization in an
+        // optimizable region in which the defined register is a live-in. This
+        // is exact for live-through regions but optimistic in the using region,
+        // where RP is actually reduced only if maximum RP is reached somewhere
+        // between the beginning of the region and the rematerializable
+        // instruction's use.
+        if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
+          RematUseful = true;
+          if (ReduceRPInRegion(It, LVRegion, DAG.LiveIns[LVRegion][Reg]))
+            return true;
         } else {
-          // Def is used and rematerialized into this block.
-          GCNDownwardRPTracker RPT(*LIS);
-          auto *NonDbgMI = &*skipDebugInstructionsForward(
-              NewRegions[Idx].first, NewRegions[Idx].second);
-          RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
-          RPT.advance(NewRegions[Idx].second);
-          NewPressure[Idx] = RPT.moveMaxPressure();
+          LLVM_DEBUG(dbgs() << "unoptimizable region\n");
         }
       }
 
-      SinkedDefs.push_back(Def);
-      ImproveOccupancy = NewPressure[I].getOccupancy(ST);
-      if (ImproveOccupancy > DAG.MinOccupancy)
-        break;
+      // If the instruction is not a live-in or live-out in any optimizable
+      // region, then there is no point in rematerializing it.
+      if (!RematUseful) {
+        RematInstructions.pop_back();
+        RA_DEBUG(
+            dbgs()
+            << "  No impact on any optimizable region, dropping instruction\n");
+      }
     }
+  }
 
-    // Remove defs we just sinked from all regions' list of sinkable defs
-    for (auto &Def : SinkedDefs)
-      for (auto TrackedIdx : RematDefToLiveInRegions[Def])
-        RematerializableInsts[TrackedIdx].erase(Def);
+  RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
+  return false;
+}
 
-    if (ImproveOccupancy <= DAG.MinOccupancy)
-      break;
+void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+                                              const TargetInstrInfo *TII) {
+  // Collect regions whose live-ins or register pressure will change due to
+  // rematerialization, mapping each to true when its maximum RP must be
+  // fully recomputed at the end.
+  SmallDenseMap<unsigned, bool> ImpactedRegions;
+  // Maps rematerialized instructions to the one they were rematerialized from.
+  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+  LiveIntervals *LIS = DAG.LIS;
 
-    NewRescheduleRegions[I] = true;
-    Improved = true;
+  // TODO: In the spirit of rematerializing the minimum number of instructions
+  // to increase occupancy, here we could sort the list of rematerializable
+  // instructions in decreasing order of "expected profitability" so that we end
+  // up moving as few instructions as possible in the loop below.
+  for (RematInstruction &Remat : RematInstructions) {
+    MachineInstr *DefMI = Remat.RematMI;
+    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+    Register Reg = DefMI->getOperand(0).getReg();
+
+    // Rematerialize MI to its use block. Since we are only rematerializing
+    // instructions that do not have any virtual reg uses, we do not need to
+    // call LiveRangeEdit::allUsesAvailableAt() and
+    // LiveRangeEdit::canRematerializeAt().
+    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+                       DefMI->getOperand(0).getSubReg(), *DefMI, *DAG.TRI);
+    MachineInstr *NewMI = &*std::prev(InsertPos);
+    LIS->InsertMachineInstrInMaps(*NewMI);
+    LIS->removeInterval(Reg);
+    LIS->createAndComputeVirtRegInterval(Reg);
+    InsertedMIToOldDef[NewMI] = DefMI;
+
+    // Update region boundaries in the scheduling region we sank from since we
+    // may sink an instruction that was at the beginning or end of its region.
+    DAG.updateRegionBoundaries(DAG.Regions, DefMI, /*NewMI =*/nullptr,
+                               /*Removing =*/true);
+
+    // Update region boundaries in the region we sank to.
+    DAG.updateRegionBoundaries(DAG.Regions, InsertPos, NewMI);
+
+    // Collect all regions impacted by the rematerialization and update their
+    // live-in/RP information.
+    for (unsigned I : Remat.LiveInRegions) {
+      ImpactedRegions.insert({I, false});
+      GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+
+      // The register is no longer live-in to any of these regions since it is
+      // now defined within the using block. In live-through regions, maximum
+      // register pressure decreases predictably so we can directly update it.
+      // In the using region, maximum RP may or may not decrease, so we mark it
+      // for re-computation after all rematerializations.
+      LaneBitmask PrevMask = RegionLiveIns[Reg];
+      RegionLiveIns.erase(Reg);
+      if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) {
+        // Register is live-through and not used in this block.
+        DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+      } else {
+        // Register is used in this block.
+        ImpactedRegions[I] = true;
+      }
+    }
+
+    // RP in the region from which the instruction was rematerialized may or may
+    // not change; recompute it fully later.
+    ImpactedRegions[Remat.DefRegion] = true;
   }
 
-  if (!Improved) {
-    // Occupancy was not improved for all regions that were at MinOccupancy.
-    // Undo sinking and remove newly rematerialized instructions.
-    for (auto &Entry : InsertedMIToOldDef) {
-      MachineInstr *MI = Entry.first;
-      MachineInstr *OldMI = Entry.second;
-      Register Reg = MI->getOperand(0).getReg();
-      LIS->RemoveMachineInstrFromMaps(*MI);
-      MI->eraseFromParent();
-      OldMI->clearRegisterDeads(Reg);
-      LIS->removeInterval(Reg);
-      LIS->createAndComputeVirtRegInterval(Reg);
+  // All regions impacted by at least one rematerialization must be rescheduled.
+  BitVector NewRescheduleRegions(DAG.Regions.size());
+  for (auto &[I, RecomputeRP] : ImpactedRegions) {
+    NewRescheduleRegions[I] = true;
+    DAG.MBBLiveIns.erase(DAG.Regions[I].first->getParent());
+
+    // Recompute maximum RP in regions from which or into which at least one
+    // instruction was rematerialized.
+    if (RecomputeRP) {
+      GCNDownwardRPTracker RPT(*LIS);
+      auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
+                                                      DAG.Regions[I].second);
+      RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
+      RPT.advance(DAG.Regions[I].second);
+      DAG.Pressure[I] = RPT.moveMaxPressure();
     }
-    return false;
   }
+  DAG.RescheduleRegions = NewRescheduleRegions;
 
-  // Occupancy was improved for all regions.
-  for (auto &Entry : InsertedMIToOldDef) {
-    MachineInstr *MI = Entry.first;
-    MachineInstr *OldMI = Entry.second;
-
-    // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+  // Clean up the IR: erase each original def now rematerialized at its use.
+  for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+    // Remove the original instruction from BBLiveInMap since we are sinking
+    // it away from its MBB.
     DAG.BBLiveInMap.erase(OldMI);
 
-    // Remove OldMI and update LIS
-    Register Reg = MI->getOperand(0).getReg();
+    // Remove the original instruction and update LIS.
+    Register Reg = NewMI->getOperand(0).getReg();
     LIS->RemoveMachineInstrFromMaps(*OldMI);
     OldMI->eraseFromParent();
     LIS->removeInterval(Reg);
     LIS->createAndComputeVirtRegInterval(Reg);
   }
 
-  // Update live-ins, register pressure, and regions caches.
-  for (auto Idx : ImpactedRegions) {
-    DAG.LiveIns[Idx] = NewLiveIns[Idx];
-    DAG.Pressure[Idx] = NewPressure[Idx];
-    DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
-  }
-  DAG.Regions = NewRegions;
-  DAG.RescheduleRegions = NewRescheduleRegions;
-
   if (GCNTrackers)
     DAG.RegionLiveOuts.buildLiveRegMap();
 
   SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
   MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
-
-  return true;
 }
 
 // Copied from MachineLICM
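For intuition, the excess-tracking done by collectRematerializableInstructions
above (the ReduceRPInRegion lambda) boils down to the following self-contained
sketch; the std::map, region indices, and VGPR counts are illustrative
stand-ins, not the patch's actual types or data:

  #include <cstdio>
  #include <map>

  int main() {
    // Region index -> VGPRs still to cut before occupancy rises there.
    std::map<unsigned, unsigned> OptRegions = {{0, 3}, {2, 1}};
    // A candidate freeing FreedRegs registers in region I "pays down" that
    // region's excess; returns true once every region is satisfied.
    auto ReduceRPInRegion = [&](unsigned I, unsigned FreedRegs) {
      auto It = OptRegions.find(I);
      if (It != OptRegions.end()) {
        if (FreedRegs >= It->second)
          OptRegions.erase(It); // Excess fully covered here.
        else
          It->second -= FreedRegs; // Partial progress; remember what's left.
      }
      return OptRegions.empty();
    };
    ReduceRPInRegion(0, 2);                    // Region 0: 3 -> 1 left.
    ReduceRPInRegion(2, 1);                    // Region 2 satisfied, erased.
    bool CanIncrease = ReduceRPInRegion(0, 1); // Region 0 satisfied too.
    std::printf("%s\n", CanIncrease ? "yes" : "no"); // Prints "yes".
  }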
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..c378564d5b2e97 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,7 +14,8 @@
 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
 
 #include "GCNRegPressure.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 
 namespace llvm {
@@ -421,27 +422,42 @@ class ClusteredLowOccStage : public GCNSchedStage {
 
 class PreRARematStage : public GCNSchedStage {
 private:
-  // Each region at MinOccupancy will have their own list of trivially
-  // rematerializable instructions we can remat to reduce RP. The list maps an
-  // instruction to the position we should remat before, usually the MI using
-  // the rematerializable instruction.
-  MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
-      RematerializableInsts;
-
-  // Map a trivially rematerializable def to a list of regions at MinOccupancy
-  // that has the defined reg as a live-in.
-  DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
-
-  // Collect all trivially rematerializable VGPR instructions with a single def
-  // and single use outside the defining block into RematerializableInsts.
-  void collectRematerializableInstructions();
-
+  /// A trivially rematerializable VGPR-defining instruction along with
+  /// pre-computed information to help update the scheduler's state when we
+  /// rematerialize it.
+  struct RematInstruction {
+    /// Trivially rematerializable instruction.
+    MachineInstr *RematMI;
+    /// Region containing the rematerializable instruction.
+    unsigned DefRegion;
+    /// Single use of the rematerializable instruction's defined register,
+    /// located in a different block.
+    MachineInstr *UseMI;
+    /// Set of regions in which the rematerializable instruction's defined
+    /// register is a live-in.
+    SmallDenseSet<unsigned, 4> LiveInRegions;
+
+    RematInstruction(MachineInstr *RematMI, unsigned DefRegion,
+                     MachineInstr *UseMI)
+        : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
+  };
+
+  /// List of eligible rematerializable instructions to sink to increase
+  /// occupancy, in function instruction order.
+  std::vector<RematInstruction> RematInstructions;
+
+  /// Collect all trivially rematerializable VGPR instructions with a single def
+  /// and single use outside the defining block into RematerializableInsts.
+  bool collectRematerializableInstructions();
+
+  /// Whether the MI is trivially rematerializable and does not have eany
+  /// virtual register use.
   bool isTriviallyReMaterializable(const MachineInstr &MI);
 
-  // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
-  // Attempt to reduce RP of VGPR by sinking trivially rematerializable
-  // instructions. Returns true if we were able to sink instruction(s).
-  bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
+  /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
+  /// Sinks all instructions in RematInstructions to increase function
+  /// occupancy. Modified regions are tagged for rescheduling.
+  void sinkTriviallyRematInsts(const GCNSubtarget &ST,
                                const TargetInstrInfo *TII);
 
 public:
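The LiveInRegions set above is what lets the sinking step update cached
pressure without rescanning every region: pure live-through regions get an
exact, in-place RP adjustment, while the defining and using regions need a
full recompute. A standalone sketch of that classification, with simplified
types standing in for the patch's region machinery (regions are identified by
index only here, which is an assumption of the sketch):

  #include <set>

  struct UpdatePlan {
    std::set<unsigned> AdjustInPlace; // Exact RP decrease is known.
    std::set<unsigned> Recompute;     // Max RP may or may not change.
  };

  // Classify every region a sunk def touches, mirroring the decision made
  // in sinkTriviallyRematInsts (sketch only).
  UpdatePlan planRegionUpdates(unsigned DefRegion, unsigned UseRegion,
                               const std::set<unsigned> &LiveInRegions) {
    UpdatePlan Plan;
    for (unsigned R : LiveInRegions) {
      if (R == UseRegion)
        Plan.Recompute.insert(R);     // Use block: RP shift is not exact.
      else
        Plan.AdjustInPlace.insert(R); // Pure live-through: exact drop.
    }
    Plan.Recompute.insert(DefRegion); // The def leaves this region entirely.
    return Plan;
  }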
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 51361b75940560..78f15ed8be8cae 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -367,6 +367,10 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
   return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
 }
 
+unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
+  return AMDGPU::IsaInfo::getNumVGPRsToIncreaseWavesPerEU(this, NumVGPRs);
+}
+
 unsigned
 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
   if (getGeneration() >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..20e1ba350ed8e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1368,6 +1368,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// VGPRs
   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
 
+  /// Returns the necessary reduction in number of VGPRs from using \p VGPRs
+  /// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
+  /// already results in maximum occupancy.
+  unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
+  
   /// Return occupancy for the given function. Used LDS and a number of
   /// registers if provided.
   /// Note, occupancy can be affected by the scratch allocation as well, but
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ab5f0694c07f95..a996cb21848643 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1185,6 +1185,19 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
   return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
 }
 
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+                                         unsigned NumVGPRs) {
+  unsigned Granule = getVGPRAllocGranule(STI);
+  unsigned MaxWaves = getMaxWavesPerEU(STI);
+  unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
+
+  unsigned NumWaves =
+      getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves, TotalNumVGPRs);
+  if (NumWaves == MaxWaves)
+    return 0;
+  return NumVGPRs - alignDown(TotalNumVGPRs / (NumWaves + 1), Granule);
+}
+
 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
                                   AMDGPUSubtarget::Generation Gen) {
   if (Gen >= AMDGPUSubtarget::GFX10)
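To make the arithmetic above concrete, here is a worked example with
gfx908-like parameters; Granule = 4, MaxWaves = 10, and TotalNumVGPRs = 256
are assumptions for illustration only:

  #include <cassert>

  // alignDown as used above: round V down to a multiple of A.
  static unsigned alignDownSketch(unsigned V, unsigned A) { return V - V % A; }

  int main() {
    // At 84 VGPRs (already granule-aligned), 256 / 84 = 3 waves per EU.
    unsigned NumVGPRs = 84, NumWaves = 3;
    // A 4th wave needs at most alignDown(256 / 4, 4) = 64 VGPRs, so the
    // necessary reduction is 84 - 64 = 20.
    unsigned Reduction = NumVGPRs - alignDownSketch(256 / (NumWaves + 1), 4);
    assert(Reduction == 20);
    (void)Reduction;
  }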
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f7fbec6a542f7..8b4136da9503cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -324,6 +324,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
 unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
                                       unsigned NumVGPRs);
 
+/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
+/// to increase the achievable number of waves per EU for this subtarget by 1.
+/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of waves per EU.
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+                                         unsigned NumVGPRs);
+
 /// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
 /// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
 unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 018da7f81e3d4b..abed0e080b7eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -3163,11 +3163,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3353,11 +3353,11 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
-  ; GFX908-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -5913,4 +5913,141 @@ body:             |
     S_NOP 0, implicit %22
     S_ENDPGM 0
 ...
+---
+name:            test_live_through_occ_7_sink_for_8
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: test_live_through_occ_7_sink_for_8
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   dead [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.2:
+  ; GFX908-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.3:
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_]]
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+
+  bb.1:
+    successors: %bb.2
 
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+    %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+
+  bb.2:
+    successors: %bb.3
+
+    S_NOP 0, implicit %2, implicit %18
+    S_NOP 0, implicit %3, implicit %19
+    S_NOP 0, implicit %4, implicit %20
+    S_NOP 0, implicit %5, implicit %21
+    S_NOP 0, implicit %6, implicit %22
+    S_NOP 0, implicit %7, implicit %23
+    S_NOP 0, implicit %8, implicit %24
+    S_NOP 0, implicit %9, implicit %25
+    S_NOP 0, implicit %10, implicit %26
+    S_NOP 0, implicit %11, implicit %27
+    S_NOP 0, implicit %12, implicit %28
+    S_NOP 0, implicit %13, implicit %29
+    S_NOP 0, implicit %14, implicit %30
+    S_NOP 0, implicit %15, implicit %31
+    S_NOP 0, implicit %16, implicit %32
+    S_NOP 0, implicit %33
+
+  bb.3:
+    S_NOP 0, implicit %0, implicit %1
+    S_ENDPGM 0
+...

>From a2e20f4db4d88db6a95198c628f8dcf69551c29d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 16:45:16 +0100
Subject: [PATCH 02/10] Edit stale comments and slightly rearch stage

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++------------
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h   | 26 ++++++++-------
 2 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1b39a5a7db7192..e43014efa08c5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -960,9 +960,10 @@ bool PreRARematStage::initGCNSchedStage() {
   // if there is another pass after this pass.
   assert(!S.hasNextStage());
 
-  if (!collectRematerializableInstructions())
+  std::vector<RematInstruction> RematInstructions;
+  if (!canIncreaseOccupancy(RematInstructions))
     return false;
-  sinkTriviallyRematInsts(ST, TII);
+  sinkTriviallyRematInsts(RematInstructions, ST, TII);
 
   LLVM_DEBUG(
       dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1258,8 +1259,7 @@ GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
+      dbgs() << "\n\t" << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1294,8 +1294,7 @@ GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
 #ifndef NDEBUG
   LLVM_DEBUG(
       printScheduleModel(ReadyCyclesSorted);
-      dbgs() << "\n\t"
-             << "Metric: "
+      dbgs() << "\n\t" << "Metric: "
              << (SumBubbles
                      ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                      : 1)
@@ -1343,8 +1342,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
       dbgs()
       << "\n\t      *** In shouldRevertScheduling ***\n"
       << "      *********** BEFORE UnclusteredHighRPStage ***********\n");
-  ScheduleMetrics MBefore =
-      getScheduleMetrics(DAG.SUnits);
+  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
   LLVM_DEBUG(
       dbgs()
       << "\n      *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1474,7 +1472,8 @@ void GCNSchedStage::revertScheduling() {
 /// Allows easy filtering of this stage's debug output.
 #define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
 
-bool PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancy(
+    std::vector<RematInstruction> &RematInstructions) {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
   RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
@@ -1511,11 +1510,11 @@ bool PreRARematStage::collectRematerializableInstructions() {
     if (NumRegs >= RPExcess) {
       OptRegions.erase(I);
       LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
-                      << "\n");
+                        << "\n");
     } else {
       RPExcess -= NumRegs;
       LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
-                      << " by " << NumRegs << " (" << RPExcess << " left)\n");
+                        << " by " << NumRegs << " (" << RPExcess << " left)\n");
     }
     return OptRegions.empty();
   };
@@ -1577,8 +1576,7 @@ bool PreRARematStage::collectRematerializableInstructions() {
         if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
           continue;
         Remat.LiveInRegions.insert(LVRegion);
-        RA_DEBUG(dbgs() << "  Def is live-in in region " << LVRegion
-                        << ": ");
+        RA_DEBUG(dbgs() << "  Def is live-in in region " << LVRegion << ": ");
 
         // Account for the reduction in RP due to the rematerialization in an
         // optimizable region in which the defined register is a live-in. This
@@ -1610,8 +1608,9 @@ bool PreRARematStage::collectRematerializableInstructions() {
   return false;
 }
 
-void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
+void PreRARematStage::sinkTriviallyRematInsts(
+    ArrayRef<RematInstruction> RematInstructions, const GCNSubtarget &ST,
+    const TargetInstrInfo *TII) {
   // Collect regions whose live-ins or register pressure will change due to
   // rematerialization, mapping each to true when its maximum RP must be
   // fully recomputed at the end.
@@ -1620,11 +1619,7 @@ void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
   LiveIntervals *LIS = DAG.LIS;
 
-  // TODO: In the spirit of rematerializing the minimum number of instructions
-  // to increase occupancy, here we could sort the list of rematerializable
-  // instructions in decreasing order of "expected profitability" so that we end
-  // up moving as few instructions as possible in the loop below.
-  for (RematInstruction &Remat : RematInstructions) {
+  for (const RematInstruction &Remat : RematInstructions) {
     MachineInstr *DefMI = Remat.RematMI;
     MachineBasicBlock::iterator InsertPos(Remat.UseMI);
     Register Reg = DefMI->getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c378564d5b2e97..e9ffea09955daf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -420,6 +420,12 @@ class ClusteredLowOccStage : public GCNSchedStage {
       : GCNSchedStage(StageID, DAG) {}
 };
 
+/// Attempts to increase function occupancy with respect to VGPR usage by one
+/// by sinking trivially rematerializable instructions to their use. When the
+/// stage estimates that increasing occupancy is possible, it rematerializes
+/// as few instructions as possible to limit negative effects on latency.
+///
+/// TODO: We should extend this to work on SGPRs and AGPRs as well.
 class PreRARematStage : public GCNSchedStage {
 private:
   /// A trivially rematerializable VGPR-defining instruction along with
@@ -442,22 +448,20 @@ class PreRARematStage : public GCNSchedStage {
         : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
   };
 
-  /// List of eligible rematerializable instructions to sink to increase
-  /// occupancy, in function instruction order.
-  std::vector<RematInstruction> RematInstructions;
+  /// Determines whether we can increase function occupancy by 1 through
+  /// rematerialization. If we can, returns true and fills \p RematInstructions
+  /// with a list of rematerializable instructions whose sinking would result in
+  /// increased occupancy; returns false otherwise.
+  bool canIncreaseOccupancy(std::vector<RematInstruction> &RematInstructions);
 
-  /// Collect all trivially rematerializable VGPR instructions with a single def
-  /// and single use outside the defining block into RematerializableInsts.
-  bool collectRematerializableInstructions();
-
-  /// Whether the MI is trivially rematerializable and does not have eany
+  /// Whether the MI is trivially rematerializable and does not have any
   /// virtual register use.
   bool isTriviallyReMaterializable(const MachineInstr &MI);
 
-  /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
-  /// Sinks all instructions in RematInstructions to increase function
+  /// Sinks all instructions in \p RematInstructions to increase function
   /// occupancy. Modified regions are tagged for rescheduling.
-  void sinkTriviallyRematInsts(const GCNSubtarget &ST,
+  void sinkTriviallyRematInsts(ArrayRef<RematInstruction> RematInstructions,
+                               const GCNSubtarget &ST,
                                const TargetInstrInfo *TII);
 
 public:
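Condensed, the stage now follows an analyze-then-commit shape:
canIncreaseOccupancy mutates nothing, so the old rollback path disappears and
sinkTriviallyRematInsts can be infallible. A stand-alone skeleton of that
flow, with ints standing in for the real instruction records (the function
names mirror the patch's; everything else is illustrative):

  #include <vector>

  static bool canIncreaseOccupancySketch(std::vector<int> &Remats) {
    Remats.push_back(42); // Pretend one candidate covers all excess RP.
    return true;          // Purely analytical; no state was modified.
  }
  static void sinkTriviallyRematInstsSketch(const std::vector<int> &) {
    // Commits the sinking; cannot fail because the analysis above already
    // proved the occupancy increase.
  }

  bool initStageSketch() {
    std::vector<int> RematInstructions;
    if (!canIncreaseOccupancySketch(RematInstructions))
      return false; // Nothing to undo.
    sinkTriviallyRematInstsSketch(RematInstructions);
    return true;    // Reschedule at occupancy + 1.
  }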

>From 0d9103463ae71faf19645dc0efb0337c0e972426 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:15:48 +0100
Subject: [PATCH 03/10] Wrap comment correctly

---
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8b4136da9503cc..783827f994c0e0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -326,7 +326,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
 
 /// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
 /// to increase the achievable number of waves per EU for this subtarget by 1.
-/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of waves per EU.
+/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
+/// waves per EU.
 unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
                                          unsigned NumVGPRs);
 

>From c095a15b92aa9318489b78ab7fa0cffaa544aaea Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:27:12 +0100
Subject: [PATCH 04/10] Fix format

---
 llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 20e1ba350ed8e7..99e163a5661fe1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1372,7 +1372,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   /// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
   /// already results in maximum occupancy.
   unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
-  
+
   /// Return occupancy for the given function. Used LDS and a number of
   /// registers if provided.
   /// Note, occupancy can be affected by the scratch allocation as well, but

>From e81f2adda81110da67a4b65e31d9ca21d9b515e3 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:18:23 +0100
Subject: [PATCH 05/10] Address review comments

---
 .../llvm/CodeGen/MachineRegisterInfo.h        |   4 +
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |   5 +
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp   |  88 ++++-----
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.h     |  16 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.cpp       |   2 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |   4 +-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   5 +-
 .../machine-scheduler-sink-trivial-remats.mir | 183 ++++++++++++++++--
 8 files changed, 236 insertions(+), 71 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 5dc51aaed81c7b..9bdae6aef134f0 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -23,6 +23,7 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/RegisterBank.h"
@@ -592,6 +593,9 @@ class MachineRegisterInfo {
   /// multiple uses.
   bool hasOneNonDBGUser(Register RegNo) const;
 
+  /// If exactly one non-debug instruction uses the specified register, returns
+  /// that instruction; otherwise returns nullptr.
+  MachineInstr *getOneNonDBGUser(Register RegNo) const;
 
   /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
   /// non-debug user instructions.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index fcedb302d228c4..5739357d31c2c1 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -431,6 +431,11 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+  auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
+  return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
+}
+
 bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
                                               unsigned MaxUsers) const {
   return hasNItemsOrLess(use_instr_nodbg_begin(Reg), use_instr_nodbg_end(),
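The helper folds the has-one-use check and the iterator dereference into a
single query; condensed from the GCNSchedStrategy.cpp hunk below, the call
site it enables looks like:

  // Before: two queries plus an iterator dereference.
  //   if (!DAG.MRI.hasOneNonDBGUse(Reg)) continue;
  //   MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
  // After: one query that also yields the unique user (or nullptr).
  //   MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
  //   if (!UseMI || DefMI.getParent() == UseMI->getParent()) continue;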
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e43014efa08c5a..42d50e86345a70 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -960,10 +960,11 @@ bool PreRARematStage::initGCNSchedStage() {
   // if there is another pass after this pass.
   assert(!S.hasNextStage());
 
-  std::vector<RematInstruction> RematInstructions;
+  SmallVector<RematInstruction> RematInstructions;
   if (!canIncreaseOccupancy(RematInstructions))
     return false;
-  sinkTriviallyRematInsts(RematInstructions, ST, TII);
+  sinkTriviallyRematInsts(RematInstructions, ST,
+                          static_cast<const SIInstrInfo *>(TII));
 
   LLVM_DEBUG(
       dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1473,7 +1474,7 @@ void GCNSchedStage::revertScheduling() {
 #define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
 
 bool PreRARematStage::canIncreaseOccupancy(
-    std::vector<RematInstruction> &RematInstructions) {
+    SmallVectorImpl<RematInstruction> &RematInstructions) {
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
 
   RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
@@ -1510,7 +1511,7 @@ bool PreRARematStage::canIncreaseOccupancy(
     if (NumRegs >= RPExcess) {
       OptRegions.erase(I);
       LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
-                        << "\n");
+                        << '\n');
     } else {
       RPExcess -= NumRegs;
       LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
@@ -1526,34 +1527,27 @@ bool PreRARematStage::canIncreaseOccupancy(
   for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
     auto Region = DAG.Regions[I];
     for (auto MI = Region.first; MI != Region.second; ++MI) {
-      // We only support instructions with at least one register (but
-      // non-subregister) operand.
+      // The instruction must be trivially rematerializable.
       MachineInstr &DefMI = *MI;
-      if (DefMI.isBundle() || !DefMI.getNumOperands() ||
-          !DefMI.getOperand(0).isReg() || DefMI.getOperand(0).getSubReg())
+      if (!isTriviallyReMaterializable(DefMI))
         continue;
 
-      // We only support rematerializing virtual VGPRs with one definition and
-      // one use.
+      // We only support rematerializing virtual VGPRs with one definition.
       Register Reg = DefMI.getOperand(0).getReg();
       if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
           !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-          !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-        continue;
-
-      // The instruction must be trivially rematerializable and have no virtual
-      // register use.
-      if (!isTriviallyReMaterializable(DefMI))
+          !DAG.MRI.hasOneDef(Reg))
         continue;
 
-      // We only care to rematerialize the instruction if its single use is in a
-      // different block.
-      MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-      if (DefMI.getParent() == UseMI->getParent())
+      // We only care to rematerialize the instruction if it has a single
+      // non-debug user in a different block.
+      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+      if (!UseMI || DefMI.getParent() == UseMI->getParent())
         continue;
 
-      RA_DEBUG(dbgs() << "In region " << I << ", instruction " << DefMI
-                      << " is rematerializable with single use " << *UseMI);
+      RA_DEBUG(dbgs() << "In region " << I << ", rematerializable instruction "
+                      << DefMI);
+      RA_DEBUG(dbgs() << "with single use " << *UseMI);
       auto &Remat = RematInstructions.emplace_back(&DefMI, I, UseMI);
 
       bool RematUseful = false;
@@ -1564,7 +1558,8 @@ bool PreRARematStage::canIncreaseOccupancy(
         // instruction and the end of the region.
         RA_DEBUG(dbgs() << "  Instruction's defining region is optimizable: ");
         RematUseful = true;
-        auto RegMask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+        LaneBitmask RegMask =
+            DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
         if (ReduceRPInRegion(It, I, RegMask))
           return true;
       }
@@ -1610,15 +1605,15 @@ bool PreRARematStage::canIncreaseOccupancy(
 
 void PreRARematStage::sinkTriviallyRematInsts(
     ArrayRef<RematInstruction> RematInstructions, const GCNSubtarget &ST,
-    const TargetInstrInfo *TII) {
+    const SIInstrInfo *TII) {
   // Collect regions whose live-ins or register pressure will change due to
   // rematerialization, and map those whose maximum RP we need to fully
   // recompute at the end to true.
   SmallDenseMap<unsigned, bool> ImpactedRegions;
   // Maps rematerialized instructions to the one they were rematerialized from.
   DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
-  LiveIntervals *LIS = DAG.LIS;
 
+  // Rematerialize all instructions.
   for (const RematInstruction &Remat : RematInstructions) {
     MachineInstr *DefMI = Remat.RematMI;
     MachineBasicBlock::iterator InsertPos(Remat.UseMI);
@@ -1631,9 +1626,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
     TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
                        DefMI->getOperand(0).getSubReg(), *DefMI, *DAG.TRI);
     MachineInstr *NewMI = &*std::prev(InsertPos);
-    LIS->InsertMachineInstrInMaps(*NewMI);
-    LIS->removeInterval(Reg);
-    LIS->createAndComputeVirtRegInterval(Reg);
+    NewMI->getOperand(0).setSubReg(DefMI->getOperand(0).getSubReg());
+    DAG.LIS->InsertMachineInstrInMaps(*NewMI);
     InsertedMIToOldDef[NewMI] = DefMI;
 
     // Update region boundaries in scheduling region we sinked from since we
@@ -1654,7 +1648,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
       // contains the single use. In live-through regions, maximum register
       // pressure decreases predictably so we can directly update it. In the
       // using region, maximum register pressure may or may not decrease, so we
-      // will mark it for re-computation after all materializations.
+      // will mark it for re-computation after all materializations have taken
+      // place.
       RegionLiveIns.erase(Reg);
       if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) {
         // Register is live-through and not used in this block.
@@ -1671,6 +1666,20 @@ void PreRARematStage::sinkTriviallyRematInsts(
     ImpactedRegions[Remat.DefRegion] = true;
   }
 
+  // Clean up the IR; remove rematerialized instructions from state.
+  for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+    // Remove rematerialized instruction from BBLiveInMap since we are sinking
+    // it from its MBB.
+    DAG.BBLiveInMap.erase(OldMI);
+
+    // Remove the rematerialized instruction and update live intervals.
+    Register Reg = NewMI->getOperand(0).getReg();
+    DAG.LIS->RemoveMachineInstrFromMaps(*OldMI);
+    OldMI->eraseFromParent();
+    DAG.LIS->removeInterval(Reg);
+    DAG.LIS->createAndComputeVirtRegInterval(Reg);
+  }
+
   // All regions impacted by at least one rematerialization must be rescheduled.
   BitVector NewRescheduleRegions(DAG.Regions.size());
   for (auto &[I, RecomputeRP] : ImpactedRegions) {
@@ -1680,7 +1689,7 @@ void PreRARematStage::sinkTriviallyRematInsts(
     // Recompute maximum RP in regions in which at least one instruction was
     // rematerialized from or to.
     if (RecomputeRP) {
-      GCNDownwardRPTracker RPT(*LIS);
+      GCNDownwardRPTracker RPT(*DAG.LIS);
       auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
                                                       DAG.Regions[I].second);
       RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
@@ -1690,20 +1699,6 @@ void PreRARematStage::sinkTriviallyRematInsts(
   }
   DAG.RescheduleRegions = NewRescheduleRegions;
 
-  // Clean up the IR; remove rematerialized instructions from state.
-  for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
-    // Remove rematerialized instruction from BBLiveInMap since we are sinking
-    // it from its MBB.
-    DAG.BBLiveInMap.erase(OldMI);
-
-    // Remove the rematerialized instruction and update LIS.
-    Register Reg = NewMI->getOperand(0).getReg();
-    LIS->RemoveMachineInstrFromMaps(*OldMI);
-    OldMI->eraseFromParent();
-    LIS->removeInterval(Reg);
-    LIS->createAndComputeVirtRegInterval(Reg);
-  }
-
   if (GCNTrackers)
     DAG.RegionLiveOuts.buildLiveRegMap();
 
@@ -1716,6 +1711,11 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
   if (!DAG.TII->isTriviallyReMaterializable(MI))
     return false;
 
+  // Even though TargetInstrInfo::isReallyTriviallyReMaterializable already
+  // ensures that the instruction has no virtual register uses,
+  // SIInstrInfo::isReallyTriviallyReMaterializable may consider an instruction
+  // rematerializable and return before calling its parent's method, so we need
+  // to double-check here.
   for (const MachineOperand &MO : MI.all_uses())
     if (MO.getReg().isVirtual())
       return false;
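
To make the double-check concrete, here is the kind of candidate it guards
against (hypothetical MIR; whether a particular opcode takes the early path in
SIInstrInfo::isReallyTriviallyReMaterializable depends on the target):

    %1:vgpr_32 = V_MOV_B32_e32 %0:vgpr_32, implicit $exec

Sinking %1 next to its use would require %0 to be live at the insertion point,
potentially extending %0's live range and increasing pressure elsewhere, so
any candidate with a virtual register use is rejected outright.
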
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index e9ffea09955daf..9ed88f2b39cf0b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -434,35 +434,35 @@ class PreRARematStage : public GCNSchedStage {
   struct RematInstruction {
     /// Trivially rematerializable instruction.
     MachineInstr *RematMI;
-    /// Region containing the rematerializable instruction.
-    unsigned DefRegion;
     /// Single use of the rematerializable instruction's defined register,
     /// located in a different block.
     MachineInstr *UseMI;
     /// Set of regions in which the rematerializable instruction's defined
     /// register is a live-in.
     SmallDenseSet<unsigned, 4> LiveInRegions;
+    /// Region containing the rematerializable instruction.
+    unsigned DefRegion;
 
     RematInstruction(MachineInstr *RematMI, unsigned DefRegion,
                      MachineInstr *UseMI)
-        : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
+        : RematMI(RematMI), UseMI(UseMI), DefRegion(DefRegion) {}
   };
 
   /// Determines whether we can increase function occupancy by 1 through
   /// rematerialization. If we can, returns true and fills \p RematInstructions
   /// with a list of rematerializable instructions whose sinking would result in
   /// increased occupancy; returns false otherwise.
-  bool canIncreaseOccupancy(std::vector<RematInstruction> &RematInstructions);
+  bool
+  canIncreaseOccupancy(SmallVectorImpl<RematInstruction> &RematInstructions);
 
-  /// Whether the MI is trivially rematerializable and does not have any
-  /// virtual register use.
+  /// Whether the MI is trivially rematerializable and does not have any virtual
+  /// register use.
   bool isTriviallyReMaterializable(const MachineInstr &MI);
 
   /// Sinks all instructions in \p RematInstructions to increase function
   /// occupancy. Modified regions are tagged for rescheduling.
   void sinkTriviallyRematInsts(ArrayRef<RematInstruction> RematInstructions,
-                               const GCNSubtarget &ST,
-                               const TargetInstrInfo *TII);
+                               const GCNSubtarget &ST, const SIInstrInfo *TII);
 
 public:
   bool initGCNSchedStage() override;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 78f15ed8be8cae..06ad445f865ff8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -368,7 +368,7 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
 }
 
 unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
-  return AMDGPU::IsaInfo::getNumVGPRsToIncreaseWavesPerEU(this, NumVGPRs);
+  return AMDGPU::IsaInfo::getVGPRReductionToIncreaseWavesPerEU(this, NumVGPRs);
 }
 
 unsigned
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index a996cb21848643..67332b25a45e08 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1185,8 +1185,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
   return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
 }
 
-unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
-                                         unsigned NumVGPRs) {
+unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+                                              unsigned NumVGPRs) {
   unsigned Granule = getVGPRAllocGranule(STI);
   unsigned MaxWaves = getMaxWavesPerEU(STI);
   unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
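
As a worked example of what the renamed function computes, assume gfx908-like
parameters (VGPR allocation granule of 4, 256 addressable VGPRs, at most 10
waves per EU); the concrete numbers below are illustrative, not taken from the
patch:

    // NumVGPRs = 130 rounds up to 132, and 256 / 132 = 1 wave per EU.
    // Reaching 2 waves needs at most 256 / 2 = 128 VGPRs, so the required
    // reduction is 130 - 128 = 2.
    unsigned Reduction =
        AMDGPU::IsaInfo::getVGPRReductionToIncreaseWavesPerEU(STI, 130);
    // Under these assumptions Reduction == 2; at NumVGPRs <= 24 the subtarget
    // is already at its 10-wave maximum and the function returns 0.
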
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 783827f994c0e0..375a02f242e33b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -328,8 +328,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
 /// to increase the achievable number of waves per EU for this subtarget by 1.
 /// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
 /// waves per EU.
-unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
-                                         unsigned NumVGPRs);
+unsigned getVGPRReductionToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+                                              unsigned NumVGPRs);
 
 /// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
 /// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index abed0e080b7eeb..0a9978dea0461a 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -5585,12 +5585,12 @@ body:             |
     S_ENDPGM 0
 ...
 ---
-name:            test_occ_9_no_sink_one_def_of_undef_subreg
+name:            test_occ_7_sink_one_def_of_undef_subreg_for_8
 tracksRegLiveness: true
 machineFunctionInfo:
   isEntryFunction: true
 body:             |
-  ; GFX908-LABEL: name: test_occ_9_no_sink_one_def_of_undef_subreg
+  ; GFX908-LABEL: name: test_occ_7_sink_one_def_of_undef_subreg_for_8
   ; GFX908: bb.0:
   ; GFX908-NEXT:   successors: %bb.1(0x80000000)
   ; GFX908-NEXT: {{  $}}
@@ -5617,16 +5617,22 @@ body:             |
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1:
   ; GFX908-NEXT:   successors: %bb.2(0x80000000)
   ; GFX908-NEXT: {{  $}}
-  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]]
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2:
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_1]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_3]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_5]]
@@ -5638,7 +5644,13 @@ body:             |
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_16]], implicit [[V_CVT_I32_F64_e32_17]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_18]], implicit [[V_CVT_I32_F64_e32_19]]
   ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_20]], implicit [[V_CVT_I32_F64_e32_21]]
-  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_22]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_24]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_26]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_28]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_30]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
   ; GFX908-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1
@@ -5666,19 +5678,24 @@ body:             |
     %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
     %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
     %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
-    undef %23.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    undef %32.sub1:vreg_64_align2 = V_MOV_B32_e32 23, implicit $exec
 
   bb.1:
-  ; predecessors: %bb.0
     successors: %bb.2
 
-    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
-    S_NOP 0, implicit %24
+    S_NOP 0, implicit %0
 
   bb.2:
-  ; predcessors: %bb.1
 
-    S_NOP 0, implicit %23.sub1
     S_NOP 0, implicit %0, implicit %1
     S_NOP 0, implicit %2, implicit %3
     S_NOP 0, implicit %4, implicit %5
@@ -5690,7 +5707,12 @@ body:             |
     S_NOP 0, implicit %16, implicit %17
     S_NOP 0, implicit %18, implicit %19
     S_NOP 0, implicit %20, implicit %21
-    S_NOP 0, implicit %22
+    S_NOP 0, implicit %22, implicit %23
+    S_NOP 0, implicit %24, implicit %25
+    S_NOP 0, implicit %26, implicit %27
+    S_NOP 0, implicit %28, implicit %29
+    S_NOP 0, implicit %30, implicit %31
+    S_NOP 0, implicit %32.sub1
     S_ENDPGM 0
 ...
 ---
@@ -6051,3 +6073,135 @@ body:             |
     S_NOP 0, implicit %0, implicit %1
     S_ENDPGM 0
 ...
+---
+name:            test_remat_over_exec_modif
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  ; GFX908-LABEL: name: test_remat_over_exec_modif
+  ; GFX908: bb.0:
+  ; GFX908-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX908-NEXT:   liveins: $sgpr2_sgpr3
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT:   %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   %save_exec:sreg_64 = S_MOV_B64 $exec
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+  ; GFX908-NEXT:   $exec = S_MOV_B64 %new_exec
+  ; GFX908-NEXT: {{  $}}
+  ; GFX908-NEXT: bb.1:
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_]], implicit [[V_CVT_I32_F64_e32_16]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+  ; GFX908-NEXT:   [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+  ; GFX908-NEXT:   S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+  ; GFX908-NEXT:   $exec = S_MOV_B64 %save_exec
+  ; GFX908-NEXT:   S_ENDPGM 0
+  bb.0:
+    liveins: $sgpr2_sgpr3
+
+    %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode, implicit-def $m0
+    %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode, implicit-def $m0
+    %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+    %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+    %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+    %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+    %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+    %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+    %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+    %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+    %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+    %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+    %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+    %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+    %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+    %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+    %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+    %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+    %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+    %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+    %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+    %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+    %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+    %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+    %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+    %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+    %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+    %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+    %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+    %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+    %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+    %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+    %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode
+
+    %save_exec:sreg_64 = S_MOV_B64 $exec
+    %new_exec:sgpr_64 = COPY $sgpr2_sgpr3
+    $exec = S_MOV_B64 %new_exec
+
+  bb.1:
+
+    S_NOP 0, implicit %0, implicit %16
+    S_NOP 0, implicit %1, implicit %17
+    S_NOP 0, implicit %2, implicit %18
+    S_NOP 0, implicit %3, implicit %19
+    S_NOP 0, implicit %4, implicit %20
+    S_NOP 0, implicit %5, implicit %21
+    S_NOP 0, implicit %6, implicit %22
+    S_NOP 0, implicit %7, implicit %23
+    S_NOP 0, implicit %8, implicit %24
+    S_NOP 0, implicit %9, implicit %25
+    S_NOP 0, implicit %10, implicit %26
+    S_NOP 0, implicit %11, implicit %27
+    S_NOP 0, implicit %12, implicit %28
+    S_NOP 0, implicit %13, implicit %29
+    S_NOP 0, implicit %14, implicit %30
+    S_NOP 0, implicit %15, implicit %31
+    S_NOP 0, implicit %32
+
+    $exec = S_MOV_B64 %save_exec
+    S_ENDPGM 0
+...

>From 1ceec9d26ff33bd2139d53aab433405ca23c8a56 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:22:52 +0100
Subject: [PATCH 06/10] Remove useless closure argument

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 42d50e86345a70..57497ba002c9d4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -306,11 +306,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
     HasHighPressure = true;
     if (SGPRDelta > VGPRDelta) {
       Cand.RPDelta.CriticalMax =
-        PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
+          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
       Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
     } else {
       Cand.RPDelta.CriticalMax =
-        PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
+          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
       Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
     }
   }
@@ -419,7 +419,7 @@ SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
       pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
                         /*IsBottomUp=*/false);
       assert(TCand.SU == TopCand.SU &&
-           "Last pick result should correspond to re-picking right now");
+             "Last pick result should correspond to re-picking right now");
     }
 #endif
   }
@@ -1502,11 +1502,12 @@ bool PreRARematStage::canIncreaseOccupancy(
   if (OptRegions.empty())
     return false;
 
-  // Tracks estimated rematerialization gains (i.e., reduction in RP) for
-  // this instruction in each optimizable region.
-  auto ReduceRPInRegion = [&](auto OptIt, unsigned I,
-                              LaneBitmask Mask) -> bool {
+  // Accounts for a reduction in RP in an optimizable region. Returns whether we
+  // estimate that we have identified enough rematerialization opportunities to
+  // increase function occupancy.
+  auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
     auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+    unsigned I = OptIt->getFirst();
     unsigned &RPExcess = OptIt->getSecond();
     if (NumRegs >= RPExcess) {
       OptRegions.erase(I);
@@ -1560,7 +1561,7 @@ bool PreRARematStage::canIncreaseOccupancy(
         RematUseful = true;
         LaneBitmask RegMask =
             DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
-        if (ReduceRPInRegion(It, I, RegMask))
+        if (ReduceRPInRegion(It, RegMask))
           return true;
       }
 
@@ -1581,7 +1582,7 @@ bool PreRARematStage::canIncreaseOccupancy(
         // instruction's use.
         if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
           RematUseful = true;
-          if (ReduceRPInRegion(It, LVRegion, DAG.LiveIns[LVRegion][Reg]))
+          if (ReduceRPInRegion(It, DAG.LiveIns[LVRegion][Reg]))
             return true;
         } else {
           LLVM_DEBUG(dbgs() << "unoptimizable region\n");
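
The dropped argument was redundant because a DenseMap iterator already carries
its key. A standalone illustration of the pattern (simplified types, not the
patch's exact ones):

    llvm::DenseMap<unsigned, unsigned> OptRegions; // region index -> RP excess
    auto It = OptRegions.find(RegionIdx);          // RegionIdx: some region
    if (It != OptRegions.end()) {
      unsigned I = It->getFirst();          // the key, i.e. the region index
      unsigned &RPExcess = It->getSecond(); // mutable reference to the value
      // ... decrement RPExcess in place, or OptRegions.erase(I) once the
      // region's excess pressure is fully covered ...
    }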

>From ce4e1ad9e0412167fe31892bf867000785b76556 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 6 Dec 2024 14:33:53 +0100
Subject: [PATCH 07/10] Fix format

---
 llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 4 ++--
 llvm/lib/CodeGen/MachineRegisterInfo.cpp        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 9bdae6aef134f0..23e4b30696e06d 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -594,8 +594,8 @@ class MachineRegisterInfo {
   bool hasOneNonDBGUser(Register RegNo) const;
 
   /// If the register has a single non-Debug instruction using the specified
-  /// register, returns it; otherwise returns nullptr. 
-  MachineInstr* getOneNonDBGUser(Register RegNo) const;
+  /// register, returns it; otherwise returns nullptr.
+  MachineInstr *getOneNonDBGUser(Register RegNo) const;
 
   /// hasAtMostUses - Return true if the given register has at most \p MaxUsers
   /// non-debug user instructions.
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 5739357d31c2c1..5c29e902849aa5 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -431,7 +431,7 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
   return hasSingleElement(use_nodbg_instructions(RegNo));
 }
 
-MachineInstr* MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
+MachineInstr *MachineRegisterInfo::getOneNonDBGUser(Register RegNo) const {
   auto RegNoDbgUsers = use_nodbg_instructions(RegNo);
   return hasSingleElement(RegNoDbgUsers) ? &*RegNoDbgUsers.begin() : nullptr;
 }

>From b4c8af98b71b4bd9c9d212aaffc13cf4f4e42127 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 9 Dec 2024 16:12:01 +0100
Subject: [PATCH 08/10] Update live interval manually instead of fully
 recomputing

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 40 ++++++++++++++++-----
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57497ba002c9d4..2d193b3a2abf64 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1611,8 +1611,8 @@ void PreRARematStage::sinkTriviallyRematInsts(
   // rematerialization, and map those whose maximum RP we need to fully
   // recompute at the end to true.
   SmallDenseMap<unsigned, bool> ImpactedRegions;
-  // Maps rematerialized instructions to the one they were rematerialized from.
-  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+  // Collect new MIs, in rematerialization order.
+  SmallVector<MachineInstr *> NewMIs;
 
   // Rematerialize all instructions.
   for (const RematInstruction &Remat : RematInstructions) {
@@ -1629,7 +1629,7 @@ void PreRARematStage::sinkTriviallyRematInsts(
     MachineInstr *NewMI = &*std::prev(InsertPos);
     NewMI->getOperand(0).setSubReg(DefMI->getOperand(0).getSubReg());
     DAG.LIS->InsertMachineInstrInMaps(*NewMI);
-    InsertedMIToOldDef[NewMI] = DefMI;
+    NewMIs.push_back(NewMI);
 
     // Update region boundaries in scheduling region we sinked from since we
     // may sink an instruction that was at the beginning or end of its region.
@@ -1668,15 +1668,39 @@ void PreRARematStage::sinkTriviallyRematInsts(
   }
 
   // Clean up the IR; remove rematerialized instructions from state.
-  for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+  for (auto [Remat, NewMI] : llvm::zip_equal(RematInstructions, NewMIs)) {
     // Remove rematerialized instruction from BBLiveInMap since we are sinking
-    // it from its MBB.
+    // it from its MBB. Also remove it from live intervals and from its MBB.
+    MachineInstr *OldMI = Remat.RematMI;
     DAG.BBLiveInMap.erase(OldMI);
-
-    // Remove the rematerialized instruction and update live intervals.
-    Register Reg = NewMI->getOperand(0).getReg();
     DAG.LIS->RemoveMachineInstrFromMaps(*OldMI);
     OldMI->eraseFromParent();
+
+    // Update the register's live interval manually. This is aligned with the
+    // instruction collection phase in that it assumes a single def and use
+    // (possibly subreg).
+    SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
+    SlotIndex NewDef = Slots->getInstructionIndex(*NewMI).getRegSlot();
+    SlotIndex NewKill = Slots->getInstructionIndex(*Remat.UseMI).getRegSlot();
+
+    auto UpdateSegment = [&](LiveRange::Segments &Segments) -> void {
+      assert(Segments.size() == 1 && "expected single segment");
+      LiveRange::Segment &Seg = Segments.front();
+      Seg.start = NewDef;
+      Seg.end = NewKill;
+      if (Seg.valno)
+        Seg.valno->def = NewDef;
+    };
+
+    Register Reg = NewMI->getOperand(0).getReg();
+    LiveInterval &RegLI = DAG.LIS->getInterval(Reg);
+    UpdateSegment(RegLI.segments);
+    if (RegLI.hasSubRanges()) {
+      LiveInterval::SubRange &SubRange = *RegLI.subrange_begin();
+      assert(!SubRange.Next && "expected at most one subrange");
+      UpdateSegment(SubRange.segments);
+    }
+
     DAG.LIS->removeInterval(Reg);
     DAG.LIS->createAndComputeVirtRegInterval(Reg);
   }
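
Spelling out the effect of the manual update: under the stage's single-def,
single-use assumption the register's live range is expected to consist of one
segment, and rematerialization only moves its endpoints and the value number's
def slot. Sketch of the before/after state (slot names made up for
illustration):

    // After inserting NewMI right before UseMI and erasing OldMI:
    //   before: segment [OldDefSlot, KillSlot), valno->def = OldDefSlot
    //   after:  segment [NewDefSlot, KillSlot), valno->def = NewDefSlot
    // with
    //   NewDefSlot = Slots->getInstructionIndex(*NewMI).getRegSlot();
    //   KillSlot   = Slots->getInstructionIndex(*Remat.UseMI).getRegSlot();
    // This trades the removeInterval + createAndComputeVirtRegInterval pair
    // for two endpoint writes, relying on the asserted single-segment
    // invariant.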

>From 928699e7b9ca1a457bba10d7e7fc5eba96e9a865 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Mon, 9 Dec 2024 16:15:35 +0100
Subject: [PATCH 09/10] Remove spurious live interval recomputing

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 2d193b3a2abf64..070fbba5aed793 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1700,9 +1700,6 @@ void PreRARematStage::sinkTriviallyRematInsts(
       assert(!SubRange.Next && "expected at most one subrange");
       UpdateSegment(SubRange.segments);
     }
-
-    DAG.LIS->removeInterval(Reg);
-    DAG.LIS->createAndComputeVirtRegInterval(Reg);
   }
 
   // All regions impacted by at least one rematerialization must be rescheduled.
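
The removed pair was spurious because, once the endpoints are rewritten in
place by the previous commit, rebuilding the interval discards exactly the
work just done:

    // Previously ran after the manual segment update and recomputed the same
    // [NewDefSlot, KillSlot) interval from scratch:
    DAG.LIS->removeInterval(Reg);
    DAG.LIS->createAndComputeVirtRegInterval(Reg);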

>From f47638e427e9eeddefa314f6bb9a0bd637d3c8b9 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 11 Dec 2024 22:48:53 +0100
Subject: [PATCH 10/10] Early return on SGPR-limited region

---
 llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 070fbba5aed793..7496ffa62e1416 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1490,8 +1490,11 @@ bool PreRARematStage::canIncreaseOccupancy(
 
     // We do not rematerialize SGPR-defining instructions yet so do not bother
     // optimizing regions whose occupancy is SGPR-limited.
-    if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy)
-      continue;
+    if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy) {
+      RA_DEBUG(dbgs() << "Region " << I
+                      << "'s occupancy is SGPR-limited, aborting\n");
+      return false;
+    }
 
     unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
     unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
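
The early return follows from occupancy being a minimum over independent
limits. Roughly, using the subtarget helpers that already appear in this PR
(sketch, not the stage's literal code):

    unsigned Occ = std::min(ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()),
                            ST.getOccupancyWithNumVGPRs(NumVGPRs));
    // If the SGPR term already equals DAG.MinOccupancy, no amount of VGPR
    // rematerialization can raise Occ, so the stage gives up immediately
    // instead of scanning the remaining regions.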
