[llvm] [AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (PR #125885)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Mon May 5 04:38:51 PDT 2025


================
@@ -1688,174 +1696,421 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
   return true;
 }
 
-void PreRARematStage::collectRematerializableInstructions() {
-  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
-  for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
-    Register Reg = Register::index2VirtReg(I);
-    if (!DAG.LIS->hasInterval(Reg))
-      continue;
+namespace {
+/// Models excess register pressure in a region and tracks our progress as we
+/// identify rematerialization opportunities.
+struct ExcessRP {
+  /// Number of excess ArchVGPRs.
+  unsigned ArchVGPRs = 0;
+  /// Number of excess AGPRs.
+  unsigned AGPRs = 0;
+  /// For unified register files, number of excess VGPRs.
+  unsigned VGPRs = 0;
+  /// For unified register files with AGPR usage, number of excess ArchVGPRs to
+  /// save before we are able to save a whole allocation granule.
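+  /// E.g., with an allocation granule of 4 and 13 ArchVGPRs in use (16
+  /// granule-aligned slots), this is 1: saving one ArchVGPR shrinks the
+  /// ArchVGPR allotment from 16 to 12 slots.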
+  unsigned ArchVGPRsToAlignment = 0;
+  /// Whether the region uses AGPRs.
+  bool HasAGPRs = false;
+  /// Whether the subtarget has a unified RF.
+  bool UnifiedRF;
+
+  /// Constructs the excess RP model; determines the excess pressure w.r.t. a
+  /// maximum number of allowed VGPRs.
+  ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
+
+  /// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
+  /// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
+  /// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
+  /// saving these ArchVGPRs helped reduce excess pressure.
+  bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
+
+  /// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
+  /// these AGPRs helped reduce excess pressure.
+  bool saveAGPRs(unsigned NumRegs);
+
+  /// Returns whether there is any excess register pressure.
+  operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
+    OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
+       << Excess.VGPRs << " VGPRs (next ArchVGPR alignment in "
+       << Excess.ArchVGPRsToAlignment << " registers)\n";
+    return OS;
+  }
+#endif
 
-    // TODO: Handle AGPR and SGPR rematerialization
-    if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
-        !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
-      continue;
+private:
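+  /// Saves min(\p LeftToSave, \p NumRegs) registers, deducting the saved
+  /// count from both; returns whether any register was saved.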
+  static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
+    unsigned NumSaved = std::min(LeftToSave, NumRegs);
+    NumRegs -= NumSaved;
+    LeftToSave -= NumSaved;
+    return NumSaved;
+  }
+};
+} // namespace
+
+ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
+                   unsigned MaxVGPRs)
+    : UnifiedRF(ST.hasGFX90AInsts()) {
+  unsigned NumArchVGPRs = RP.getArchVGPRNum();
+  unsigned NumAGPRs = RP.getAGPRNum();
+  HasAGPRs = NumAGPRs;
+
+  if (!UnifiedRF) {
+    // Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
+    // independently.
+    if (NumArchVGPRs > MaxVGPRs)
+      ArchVGPRs = NumArchVGPRs - MaxVGPRs;
+    if (NumAGPRs > MaxVGPRs)
+      AGPRs = NumAGPRs - MaxVGPRs;
+    return;
+  }
 
-    MachineOperand *Op = DAG.MRI.getOneDef(Reg);
-    MachineInstr *Def = Op->getParent();
-    if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
-      continue;
+  // Independently of whether overall VGPR pressure is under the limit, we still
+  // have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
+  // number of addressable registers in each category.
+  const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+  if (NumArchVGPRs > MaxArchVGPRs) {
+    ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
+    NumArchVGPRs = MaxArchVGPRs;
+  }
+  if (NumAGPRs > MaxArchVGPRs) {
+    AGPRs = NumAGPRs - MaxArchVGPRs;
+    NumAGPRs = MaxArchVGPRs;
+  }
 
-    MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
-    if (Def->getParent() == UseI->getParent())
-      continue;
+  // Check overall VGPR usage against the limit; any excess above addressable
+  // register limits has already been accounted for.
+  const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
+  unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
+  if (NumVGPRs > MaxVGPRs) {
+    VGPRs = NumVGPRs - MaxVGPRs;
+    ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, Granule);
+    if (!ArchVGPRsToAlignment)
+      ArchVGPRsToAlignment = Granule;
+  }
+}
 
-    bool HasRematDependency = false;
-    // Check if this instruction uses any registers that are planned to be
-    // rematerialized
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    for (MachineOperand &MO : Def->operands()) {
-                      if (!MO.isReg())
-                        continue;
-                      if (MO.getReg() == Remat.first->getOperand(0).getReg())
-                        return true;
-                    }
-                    return false;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
-      }
+bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
+  bool Progress = saveRegs(ArchVGPRs, NumRegs);
+  if (!NumRegs)
+    return Progress;
+
+  if (!UnifiedRF) {
+    if (UseArchVGPRForAGPRSpill)
+      Progress |= saveRegs(AGPRs, NumRegs);
+  } else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
+    // There is progress as long as there are VGPRs left to save, even if the
+    // save induced by this particular call does not cross an ArchVGPR alignment
+    // barrier.
+    Progress = true;
+
+    // ArchVGPRs can only be allocated as a multiple of a granule in unified RF.
+    unsigned NumSavedRegs = 0;
+
+    // Count the number of whole ArchVGPR allocation granules we can save.
+    const unsigned Granule = AMDGPU::IsaInfo::getArchVGPRAllocGranule();
+    if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
+      NumSavedRegs = NumGranules * Granule;
+      NumRegs -= NumSavedRegs;
     }
-    // Do not rematerialize an instruction if it uses an instruction that we
-    // have designated for rematerialization.
-    // FIXME: Allow for rematerialization chains: this requires 1. updating
-    // remat points to account for uses that are rematerialized, and 2. either
-    // rematerializing the candidates in careful ordering, or deferring the MBB
-    // RP walk until the entire chain has been rematerialized.
-    if (HasRematDependency)
-      continue;
 
-    // Similarly, check if the UseI is planned to be remat.
-    for (auto &RematEntry : RematerializableInsts) {
-      if (find_if(RematEntry.second,
-                  [&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
-                    return Remat.first == UseI;
-                  }) != RematEntry.second.end()) {
-        HasRematDependency = true;
-        break;
-      }
+    // We may be able to save one more whole ArchVGPR allocation granule.
+    if (NumRegs >= ArchVGPRsToAlignment) {
+      NumSavedRegs += Granule;
+      ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
+    } else {
+      ArchVGPRsToAlignment -= NumRegs;
     }
 
-    if (HasRematDependency)
-      break;
+    // Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
+    // spilling and have some free ArchVGPR slots.
+    saveRegs(VGPRs, NumSavedRegs);
+    if (UseArchVGPRForAGPRSpill)
+      saveRegs(AGPRs, NumSavedRegs);
+  } else {
+    // No AGPR usage in the region, i.e., no allocation granule to worry about.
+    Progress |= saveRegs(VGPRs, NumRegs);
+  }
 
-    // We are only collecting defs that are defined in another block and are
-    // live-through or used inside regions at MinOccupancy. This means that the
-    // register must be in the live-in set for the region.
-    bool AddedToRematList = false;
-    for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
-      auto It = DAG.LiveIns[I].find(Reg);
-      if (It != DAG.LiveIns[I].end() && !It->second.none()) {
-        if (DAG.RegionsWithMinOcc[I]) {
-          SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
-          SlotIndex UseIdx =
-              DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
-          if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
-            RematerializableInsts[I][Def] = UseI;
-            AddedToRematList = true;
-          }
-        }
+  return Progress;
+}
+
+bool ExcessRP::saveAGPRs(unsigned NumRegs) {
+  return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
+}
 
-        // Collect regions with rematerializable reg as live-in to avoid
-        // searching later when updating RP.
-        RematDefToLiveInRegions[Def].push_back(I);
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
+
+  REMAT_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+                     << MF.getFunction().getName() << '\n');
+
+  // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+  // occupancy, or regions with VGPR spilling) to a model of their excess RP.
+  DenseMap<unsigned, ExcessRP> OptRegions;
+  const Function &F = MF.getFunction();
+
+  std::pair<unsigned, unsigned> WavesPerEU = ST.getWavesPerEU(F);
+  const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
+      F, WavesPerEU, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
+  const unsigned MaxVGPRsNoSpill =
+      ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(WavesPerEU.second),
+                                ST.getMaxNumVGPRs(WavesPerEU.first)});
+  const unsigned MaxSGPRsIncOcc =
+      ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
+  const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
+  IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
+
+  auto ClearOptRegionsIf = [&](bool Cond) -> bool {
+    if (Cond) {
+      // We won't try to increase occupancy.
+      IncreaseOccupancy = false;
+      OptRegions.clear();
+    }
+    return Cond;
+  };
+
+  // Collect optimizable regions. If there is spilling in any region, we will
+  // just try to reduce ArchVGPR spilling. Otherwise, we will try to increase
+  // occupancy by one in the whole function.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    GCNRegPressure &RP = DAG.Pressure[I];
+
+    // Check whether SGPR pressure prevents us from eliminating spilling.
+    unsigned NumSGPRs = RP.getSGPRNum();
+    if (NumSGPRs > MaxSGPRsNoSpill)
+      ClearOptRegionsIf(IncreaseOccupancy);
+
+    ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
+    if (Excess) {
+      ClearOptRegionsIf(IncreaseOccupancy);
+    } else if (IncreaseOccupancy) {
+      // Check whether SGPR pressure prevents us from increasing occupancy.
+      if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
+        if (DAG.MinOccupancy >= WavesPerEU.first)
+          return false;
+        continue;
+      }
+      if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
+        // We can only rematerialize ArchVGPRs at this point.
+        unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
+        bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
+        if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
+          if (DAG.MinOccupancy >= WavesPerEU.first)
+            return false;
+          continue;
+        }
       }
     }
-    if (!AddedToRematList)
-      RematDefToLiveInRegions.erase(Def);
+    if (Excess)
+      OptRegions.insert({I, Excess});
   }
-}
+  if (OptRegions.empty())
+    return false;
 
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
-                                              const TargetInstrInfo *TII) {
-  // Temporary copies of cached variables we will be modifying and replacing if
-  // sinking succeeds.
-  SmallVector<
-      std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
-      NewRegions;
-  DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
-  DenseMap<unsigned, GCNRegPressure> NewPressure;
-  BitVector NewRescheduleRegions;
-  LiveIntervals *LIS = DAG.LIS;
+#ifndef NDEBUG
+  if (IncreaseOccupancy)
+    REMAT_DEBUG(dbgs() << "Occupancy minimal in regions:\n");
+  else
+    REMAT_DEBUG(dbgs() << "Spilling in regions:\n");
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end())
+      REMAT_DEBUG(dbgs() << "  " << I << ": " << OptIt->getSecond() << '\n');
+  }
+#endif
+
+  // When we are reducing spilling, the target is the minimum number of
+  // waves/EU determined by the subtarget.
+  TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
+
+  // Accounts for a reduction in RP in an optimizable region. Returns whether we
+  // estimate that we have identified enough rematerialization opportunities to
+  // achieve our goal, and sets Progress to true when this particular reduction
+  // in pressure was helpful toward that goal.
+  auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
+                              bool &Progress) -> bool {
+    ExcessRP &Excess = OptIt->getSecond();
+    // We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
+    // only when we are just trying to eliminate spilling to memory. At this
+    // point we err on the conservative side and do not increase
+    // register-to-register spilling for the sake of increasing occupancy.
+    Progress |=
+        Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
+                             /*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
+    if (!Excess)
+      OptRegions.erase(OptIt->getFirst());
+    return OptRegions.empty();
+  };
+
+  // We need up-to-date live-out info to query live-out register masks in
+  // regions containing rematerializable instructions.
+  DAG.RegionLiveOuts.buildLiveRegMap();
+
+  // Cache set of registers that are going to be rematerialized.
+  DenseSet<unsigned> RematRegs;
+
+  // Identify rematerializable instructions in the function.
+  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+    auto Region = DAG.Regions[I];
+    for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // The instruction must be trivially rematerializable.
+      MachineInstr &DefMI = *MI;
+      if (!isTriviallyReMaterializable(DefMI))
+        continue;
 
-  NewRegions.resize(DAG.Regions.size());
-  NewRescheduleRegions.resize(DAG.Regions.size());
+      // We only support rematerializing virtual VGPRs with one definition.
+      Register Reg = DefMI.getOperand(0).getReg();
+      if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+          !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+          !DAG.MRI.hasOneDef(Reg))
+        continue;
 
-  // Collect only regions that has a rematerializable def as a live-in.
-  SmallSet<unsigned, 16> ImpactedRegions;
-  for (const auto &It : RematDefToLiveInRegions)
-    ImpactedRegions.insert_range(It.second);
+      // We only care to rematerialize the instruction if it has a single
+      // non-debug user in a different block. The using MI may not belong to a
+      // region if it is a lone region terminator.
+      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+      auto UseRegion = MIRegion.find(UseMI);
+      if (!UseMI || (UseRegion != MIRegion.end() && UseRegion->second == I))
+        continue;
 
-  // Make copies of register pressure and live-ins cache that will be updated
-  // as we rematerialize.
-  for (auto Idx : ImpactedRegions) {
-    NewPressure[Idx] = DAG.Pressure[Idx];
-    NewLiveIns[Idx] = DAG.LiveIns[Idx];
-  }
-  NewRegions = DAG.Regions;
-  NewRescheduleRegions.reset();
+      // Do not rematerialize an instruction if it uses or is used by an
+      // instruction that we have designated for rematerialization.
+      // FIXME: Allow for rematerialization chains: this requires 1. updating
+      // remat points to account for uses that are rematerialized, and 2. either
+      // rematerializing the candidates in careful ordering, or deferring the
+      // MBB RP walk until the entire chain has been rematerialized.
+      if (Rematerializations.contains(UseMI) ||
+          llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
+            return MO.isReg() && RematRegs.contains(MO.getReg());
+          }))
+        continue;
 
-  DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
-  bool Improved = false;
-  for (auto I : ImpactedRegions) {
-    if (!DAG.RegionsWithMinOcc[I])
-      continue;
+      // Do not rematerialize an instruction if it uses registers that aren't
+      // available at its use. This ensures that we are not extending any live
+      // range while rematerializing.
+      SlotIndex DefIdx = DAG.LIS->getInstructionIndex(DefMI);
+      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
+      if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
+        continue;
 
-    Improved = false;
-    int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
-    int SGPRUsage = NewPressure[I].getSGPRNum();
+      REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
+      RematInstruction &Remat =
+          Rematerializations.try_emplace(&DefMI, UseMI).first->second;
+
+      bool RematUseful = false;
+      if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+        // Optimistically consider that moving the instruction out of its
+        // defining region will reduce RP in the latter; this assumes that
+        // maximum RP in the region is reached somewhere between the defining
+        // instruction and the end of the region.
+        REMAT_DEBUG(dbgs() << "  Defining region is optimizable\n");
+        LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+        if (ReduceRPInRegion(It, Mask, RematUseful))
+          return true;
+      }
 
-    // TODO: Handle occupancy drop due to AGPR and SGPR.
-    // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
-    if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
-      break;
+      for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
+        // We are only collecting regions in which the register is a live-in
+        // (and may be live-through).
+        auto It = DAG.LiveIns[LIRegion].find(Reg);
+        if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
+          continue;
+        Remat.LiveInRegions.insert(LIRegion);
+
+        // Account for the reduction in RP due to the rematerialization in an
+        // optimizable region in which the defined register is a live-in. This
+        // is exact for live-through regions but optimistic in the using
+        // region, where RP is actually reduced only if maximum RP is reached
+        // somewhere between the beginning of the region and the
+        // rematerializable instruction's use.
+        if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
+          REMAT_DEBUG(dbgs() << "  Live-in in region " << LIRegion << '\n');
+          if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
+            return true;
+        }
+      }
 
-    // The occupancy of this region could have been improved by a previous
-    // iteration's sinking of defs.
-    if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
-      NewRescheduleRegions[I] = true;
-      Improved = true;
-      continue;
+      // If the instruction is not a live-in or live-out in any optimizable
+      // region then there is no point in rematerializing it.
+      if (!RematUseful) {
+        Rematerializations.pop_back();
+        REMAT_DEBUG(dbgs() << "  No impact, not rematerializing instruction\n");
+      } else {
+        RematRegs.insert(Reg);
+      }
+    }
+  }
+
+  if (IncreaseOccupancy) {
+    // We were trying to increase occupancy but failed; abort the stage.
+    REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
+    Rematerializations.clear();
+    return false;
+  }
+  REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
+  return !Rematerializations.empty();
+}
+
+void PreRARematStage::rematerialize() {
+  const auto *TII =
+      static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+  // Collect regions whose RP changes in an unpredictable way; we will have to
+  // fully recompute their RP after all rematerializations.
+  DenseSet<unsigned> RecomputeRP;
+
+  // Rematerialize all instructions.
+  for (auto &[DefMI, Remat] : Rematerializations) {
+    MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+    Register Reg = DefMI->getOperand(0).getReg();
+    unsigned SubReg = DefMI->getOperand(0).getSubReg();
+    unsigned DefRegion = MIRegion.at(DefMI);
+
+    // Rematerialize DefMI to its use block.
+    TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, *DefMI,
+                       *DAG.TRI);
+    Remat.RematMI = &*std::prev(InsertPos);
+    Remat.RematMI->getOperand(0).setSubReg(SubReg);
+    DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
+
+    // Update region boundaries in regions we sank from (remove the defining
+    // MI) and to (insert the MI rematerialized in the use block). Only then
+    // can we erase the original MI.
+    DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
+    auto UseRegion = MIRegion.find(Remat.UseMI);
+    if (UseRegion != MIRegion.end()) {
+      DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
+                                 Remat.RematMI);
     }
+    DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
+    DefMI->eraseFromParent();
+
+    // Collect all regions impacted by the rematerialization and update their
+    // live-in/RP information.
+    for (unsigned I : Remat.LiveInRegions) {
+      ImpactedRegions.insert({I, DAG.Pressure[I]});
+      GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
 
-    // First check if we have enough trivially rematerializable instructions to
-    // improve occupancy. Optimistically assume all instructions we are able to
-    // sink decreased RP.
-    int TotalSinkableRegs = 0;
-    for (const auto &It : RematerializableInsts[I]) {
-      MachineInstr *Def = It.first;
-      Register DefReg = Def->getOperand(0).getReg();
-      TotalSinkableRegs +=
-          SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
 #ifdef EXPENSIVE_CHECKS
       // All uses are known to be available / live at the remat point. Thus, the
       // uses should already be live in to the region.
-      for (MachineOperand &MO : Def->operands()) {
+      for (MachineOperand &MO : DefMI->operands()) {
         if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
           continue;
 
         Register UseReg = MO.getReg();
         if (!UseReg.isVirtual())
           continue;
 
-        LiveInterval &LI = LIS->getInterval(UseReg);
+        LiveInterval &LI = DAG.LIS->getInterval(UseReg);
         LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
         if (LI.hasSubRanges() && MO.getSubReg())
           LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
 
-        assert(NewLiveIns[I].contains(UseReg));
-        LaneBitmask LiveInMask = NewLiveIns[I][UseReg];
+        assert(RegionLiveIns.contains(UseReg));
+        LaneBitmask LiveInMask = RegionLiveIns[UseReg];
----------------
arsenm wrote:

Avoid double map lookup. Is this RegionLiveIns.at? 
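
For illustration, the single-lookup form being suggested could look like the
following (a sketch only; DenseMap::at asserts that the key is present, which
subsumes the contains() check above):

    // One lookup instead of contains() followed by operator[].
    LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);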

https://github.com/llvm/llvm-project/pull/125885

