[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 2 12:21:09 PDT 2025
================
@@ -1817,106 +1987,145 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
continue;
- REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
- RematInstruction &Remat =
- Rematerializations.try_emplace(&DefMI, UseMI).first->second;
-
- bool RematUseful = false;
- if (auto It = OptRegions.find(I); It != OptRegions.end()) {
- // Optimistically consider that moving the instruction out of its
- // defining region will reduce RP in the latter; this assumes that
- // maximum RP in the region is reached somewhere between the defining
- // instruction and the end of the region.
- REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
- LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
- return true;
- }
-
- for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
- // We are only collecting regions in which the register is a live-in
- // (and may be live-through).
- auto It = DAG.LiveIns[LIRegion].find(Reg);
- if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
- continue;
- Remat.LiveInRegions.insert(LIRegion);
-
- // Account for the reduction in RP due to the rematerialization in an
- // optimizable region in which the defined register is a live-in. This
- // is exact for live-through region but optimistic in the using region,
- // where RP is actually reduced only if maximum RP is reached somewhere
- // between the beginning of the region and the rematerializable
- // instruction's use.
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
- REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
- if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
- RematUseful))
- return true;
- }
- }
-
- // If the instruction is not a live-in or live-out in any optimizable
- // region then there is no point in rematerializing it.
- if (!RematUseful) {
- Rematerializations.pop_back();
- REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
- } else {
- RematRegs.insert(Reg);
- }
+ // Add the instruction to the rematerializable list.
+ RematRegSet.insert(Reg);
+ RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion, RegionFreq);
}
}
- if (TargetOcc) {
- // We were trying to increase occupancy but failed, abort the stage.
- REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
- Rematerializations.clear();
- return false;
- }
- REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
- return !Rematerializations.empty();
+ return !RematRegs.empty();
}
-void PreRARematStage::rematerialize() {
- const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+PreRARematStage::RematReg::RematReg(
+ MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
+ const DenseMap<MachineInstr *, unsigned> &MIRegion,
+ ArrayRef<uint64_t> RegionFreq)
+ : DefMI(DefMI), UseMI(UseMI), UseRegion(MIRegion.at(UseMI)),
+ LiveIn(DAG.Regions.size()), LiveOut(DAG.Regions.size()),
+ Live(DAG.Regions.size()), DefFrequency(RegionFreq[MIRegion.at(DefMI)]),
+ UseFrequency(RegionFreq[MIRegion.at(UseMI)]) {
- // Collect regions whose RP changes in unpredictable way; we will have to
- // fully recompute their RP after all rematerailizations.
- DenseSet<unsigned> RecomputeRP;
-
- // Rematerialize all instructions.
- for (auto &[DefMI, Remat] : Rematerializations) {
- MachineBasicBlock::iterator InsertPos(Remat.UseMI);
- Register Reg = DefMI->getOperand(0).getReg();
- unsigned DefRegion = MIRegion.at(DefMI);
-
- // Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
- Remat.RematMI = &*std::prev(InsertPos);
- DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
-
- // Update region boundaries in regions we sinked from (remove defining MI)
- // and to (insert MI rematerialized in use block). Only then we can erase
- // the original MI.
- DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
- auto UseRegion = MIRegion.find(Remat.UseMI);
- if (UseRegion != MIRegion.end()) {
- DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
- Remat.RematMI);
+ // Mark regions in which the rematerializable register is live.
+ Register Reg = DefMI->getOperand(0).getReg();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto LiveInIt = DAG.LiveIns[I].find(Reg);
+ if (LiveInIt != DAG.LiveIns[I].end() && LiveInIt->second.any())
+ LiveIn.set(I);
+ auto LiveOutIt = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).find(Reg);
+ auto LiveOutEnd = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).end();
+ if (LiveOutIt != LiveOutEnd && LiveOutIt->second.any())
+ LiveOut.set(I);
+ }
+ Live |= LiveIn;
+ Live |= LiveOut;
+
+ // Store the register's lane bitmask.
+ unsigned SubIdx = DefMI->getOperand(0).getSubReg();
+ Mask = SubIdx ? DAG.TRI->getSubRegIndexLaneMask(SubIdx)
+ : DAG.MRI.getMaxLaneMaskForVReg(Reg);
+}
+
+MachineInstr *
+PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
+ MachineBasicBlock::iterator InsertPos,
+ GCNScheduleDAGMILive &DAG) const {
+ MachineInstr *NewMI = &*std::prev(InsertPos);
+ DAG.updateRegionBoundaries(DAG.Regions[RegionIdx], InsertPos, NewMI);
+ DAG.LIS->InsertMachineInstrInMaps(*NewMI);
+ DAG.LIS->createAndComputeVirtRegInterval(NewMI->getOperand(0).getReg());
+ return NewMI;
+}
+
+PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
+ const GCNSubtarget &ST,
+ const TargetInstrInfo &TII)
+ : Remat(Remat) {
+ const InstrItineraryData *Itin = ST.getInstrItineraryData();
+ if (Remat->DefFrequency && Remat->UseFrequency) {
+ InstrLatencyGain = Remat->DefFrequency - Remat->UseFrequency;
+ *InstrLatencyGain *= TII.getInstrLatency(Itin, *Remat->DefMI);
+ }
+ resetScore();
+}
+
+void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
+ ArrayRef<GCNRPTarget> RPTargets,
+ ArrayRef<uint64_t> RegionFreq,
+ bool ReduceSpill) {
+  // Exit early if no target region intersects with the register's live
+  // regions.
+ if (!Remat->intersectWithTarget(TargetRegions))
+ return setUselessRemat();
+ resetScore();
+
+ // When the stage is trying to reduce spilling, we want to pick
+ // rematerialization candidates that will be beneficial to latency. When it is
+ // trying to increase occupancy, we are fine increasing latency to try to
+ // reduce RP.
+ // FIXME: In the increasing occupancy case, we should be able to incorporate
+ // the latency loss induced by rematerializations into the final score. It
+ // seems possible to very roughly estimate the overall kernel latency upside
+ // we get by increasing occupancy and compare it to the latency hit each wave
+ // will be subjected to.
+ if (ReduceSpill) {
+ // It may be better to let the register spill if it is defined by a very
+ // high latency instruction. Try to estimate the latency gain induced by
+ // rematerializing the register.
+ //
+  // If we don't know the rematerialization's latency gain we don't know
+ // what to compare the spill latency against. We still consider the
+ // rematerialization potentially beneficial in such cases because we don't
+ // want to miss rematerialization opportunities and rematerializing is in
+ // most cases cheaper than spilling. We still give a bonus to remats for
+ // which we are able to do the calculation.
+ if (InstrLatencyGain && *InstrLatencyGain < 0) {
+ int SpillLatencyGain = SaveCost * Remat->DefFrequency;
+ SpillLatencyGain += RestoreCost * Remat->UseFrequency;
+ if (*InstrLatencyGain + SpillLatencyGain < 0)
+ return setUselessRemat();
+ setKnownLatencyGain();
}
- DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
- DefMI->eraseFromParent();
+ }
+
+ // The estimated RP reduction is proportional to the total frequency in target
+ // regions where the register is live.
+ Register Reg = Remat->DefMI->getOperand(0).getReg();
+ unsigned RPScore = 0;
+ for (unsigned I : TargetRegions.set_bits()) {
+ unsigned Freq = std::max(RegionFreq[I], static_cast<uint64_t>(1));
+ if (Remat->isBeneficialRegion(I))
+ Score += WeightRP * RPTargets[I].isSaveBeneficial(Reg) * Freq;
+ else if (Remat->isMaybeBeneficialRegion(I))
+ Score += WeightRPMaybe * RPTargets[I].isSaveBeneficial(Reg) * Freq;
+ }
- // Collect all regions impacted by the rematerialization and update their
- // live-in/RP information.
- for (unsigned I : Remat.LiveInRegions) {
- ImpactedRegions.insert({I, DAG.Pressure[I]});
- GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+ // The estimated RP reduction is directly proportional to the size of the
+ // rematerializable register.
+ setRPScore(RPScore * SIRegisterInfo::getNumCoveredRegs(Remat->Mask));
----------------
jrbyrnes wrote:
IMO, the scoring model should take into account (perhaps in order of priority):
1. the maximal frequency of affected regions (if spilling)
2. the delta between def / remat frequency
3. the number of target regions affected
4. then reg width.
For the occupancy case, the frequency of the intermediate target blocks doesn't matter -- we remove latency from the original def region and add latency to the remat region. We shouldn't care about the intermediate frequency, since doing a remat has no impact on those regions -- if we do enough remats we will increase occupancy, which is function wide.
For the spilling case, the frequency of the intermediate target blocks does matter. If our RP is above the threshold, then we will spill somewhere, and we would prefer not to spill in the hottest blocks. Thus, we should prioritize reducing RP in the highest-frequency regions first.
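For illustration, here is a minimal standalone sketch (not code from this patch) of a lexicographic ranking that follows the priorities above. The struct and its fields are hypothetical stand-ins for data the stage already tracks (hottest affected target region, def/remat frequency delta, number of affected target regions, register width via getNumCoveredRegs):

#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical per-candidate summary; in the patch this data would come from
// RematReg / ScoredRemat (region frequencies, target regions, lane mask width).
struct RematCandidateInfo {
  uint64_t MaxAffectedRegionFreq = 0; // hottest target region the remat helps
  int64_t DefMinusRematFreq = 0;      // DefFrequency - UseFrequency
  unsigned NumTargetRegionsAffected = 0;
  unsigned NumCoveredRegs = 0;        // register width in 32-bit registers
};

// Returns true if A should be rematerialized before B. The frequency of the
// affected regions is only consulted when the stage is trying to reduce
// spilling; in the occupancy case that criterion is skipped, matching the
// distinction drawn above.
static bool betterCandidate(const RematCandidateInfo &A,
                            const RematCandidateInfo &B, bool ReduceSpill) {
  if (ReduceSpill && A.MaxAffectedRegionFreq != B.MaxAffectedRegionFreq)
    return A.MaxAffectedRegionFreq > B.MaxAffectedRegionFreq;
  if (A.DefMinusRematFreq != B.DefMinusRematFreq)
    return A.DefMinusRematFreq > B.DefMinusRematFreq;
  if (A.NumTargetRegionsAffected != B.NumTargetRegionsAffected)
    return A.NumTargetRegionsAffected > B.NumTargetRegionsAffected;
  return A.NumCoveredRegs > B.NumCoveredRegs;
}

void rankCandidates(std::vector<RematCandidateInfo> &Candidates,
                    bool ReduceSpill) {
  std::stable_sort(Candidates.begin(), Candidates.end(),
                   [ReduceSpill](const RematCandidateInfo &A,
                                 const RematCandidateInfo &B) {
                     return betterCandidate(A, B, ReduceSpill);
                   });
}

Each later criterion only breaks ties left by the earlier ones, so reg width acts purely as a final tiebreaker rather than dominating the score.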
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits mailing list