[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 30 07:35:15 PDT 2025


================
@@ -1089,33 +1100,224 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
 #define REMAT_PREFIX "[PreRARemat] "
 #define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Prints register-pressure targets to the debug stream: every region's target
+// when PrintAll is set, otherwise only the targets of regions currently in
+// TargetRegions (or a placeholder line when there are none).
+void PreRARematStage::printTargetRegions(bool PrintAll) const {
+  // Unconditionally dump all regions' RP targets when requested.
+  if (PrintAll) {
+    for (auto [I, Target] : enumerate(RPTargets))
+      dbgs() << REMAT_PREFIX << "  [" << I << "] " << Target << '\n';
+    return;
+  }
+  if (TargetRegions.none()) {
+    dbgs() << REMAT_PREFIX << "No target regions\n";
+    return;
+  }
+  // Only dump targets for regions still marked as target regions.
+  dbgs() << REMAT_PREFIX << "Target regions:\n";
+  for (unsigned I : TargetRegions.set_bits())
+    dbgs() << REMAT_PREFIX << "  [" << I << "] " << RPTargets[I] << '\n';
+}
+
+// Prints this rematerialization candidate to the debug stream: the defining
+// instruction (tagged with its parent region, looked up in MIRegion), the
+// using instruction and its region, followed by the lists of regions in which
+// rematerializing the register is guaranteed (resp. possibly) beneficial for
+// register pressure, as reported by isBeneficialRegion /
+// isMaybeBeneficialRegion.
+void PreRARematStage::RematReg::print(
+    const DenseMap<MachineInstr *, unsigned> &MIRegion) const {
+  dbgs() << REMAT_PREFIX << "  [" << MIRegion.at(DefMI) << "] " << *DefMI;
+  dbgs() << REMAT_PREFIX << "    -> used in [" << UseRegion << "] " << *UseMI;
+  // Live has one entry per region; its size bounds the region index space.
+  const unsigned NumRegions = Live.size();
+  dbgs() << REMAT_PREFIX << "    Guaranteed RP reduction in:";
+  for (unsigned I = 0; I < NumRegions; ++I) {
+    if (isBeneficialRegion(I))
+      dbgs() << " [" << I << "]";
+  }
+  dbgs() << '\n';
+  dbgs() << REMAT_PREFIX << "    Possible RP reduction in:";
+  for (unsigned I = 0; I < NumRegions; ++I) {
+    if (isMaybeBeneficialRegion(I))
+      dbgs() << " [" << I << "]";
+  }
+  dbgs() << '\n';
+}
+
+#endif
+
 bool PreRARematStage::initGCNSchedStage() {
   // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
   // regions inbetween the defs and region we sinked the def to. Will need to be
   // fixed if there is another pass after this pass.
   assert(!S.hasNextStage());
 
-  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
     return false;
 
   // Before performing any IR modification record the parent region of each MI
   // and the parent MBB of each region.
   const unsigned NumRegions = DAG.Regions.size();
-  RegionBB.reserve(NumRegions);
   for (unsigned I = 0; I < NumRegions; ++I) {
     RegionBoundaries Region = DAG.Regions[I];
     for (auto MI = Region.first; MI != Region.second; ++MI)
       MIRegion.insert({&*MI, I});
-    RegionBB.push_back(Region.first->getParent());
+    MachineBasicBlock *ParentMBB = Region.first->getParent();
+    if (Region.second != ParentMBB->end())
+      MIRegion.insert({&*Region.second, I});
+    RegionBB.push_back(ParentMBB);
   }
 
-  if (!canIncreaseOccupancyOrReduceSpill())
+  setObjective();
+  REMAT_DEBUG({
+    dbgs() << "Analyzing ";
+    MF.getFunction().printAsOperand(dbgs(), false);
+    dbgs() << ": ";
+    if (TargetRegions.none()) {
+      dbgs() << "no objective to achieve, occupancy is maximal at "
+             << MFI.getMaxWavesPerEU() << '\n';
+    } else if (TargetOcc) {
+      dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+    } else {
+      dbgs() << "reduce spilling (minimum target occupancy is "
+             << MFI.getMinWavesPerEU() << ")\n";
+    }
+    printTargetRegions(/*PrintAll=*/TargetRegions.none());
+  });
+
+  // Compute region frequencies. 0 encodes an unknown region frequency.
+  SmallVector<uint64_t> RegionFreq;
+  RegionFreq.reserve(NumRegions);
+  assert(DAG.MLI && "MLI not defined in DAG");
+  MachineBranchProbabilityInfo MBPI;
+  MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
+  uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+  if (EntryFreq) {
+    for (const MachineBasicBlock *MBB : RegionBB)
+      RegionFreq.push_back(MBFI.getBlockFreq(MBB).getFrequency() / EntryFreq);
+  } else {
+    RegionFreq.insert(RegionFreq.end(), RegionBB.size(), 0);
+  }
+  REMAT_DEBUG({
+    dbgs() << "Region frequencies:\n";
+    for (auto [I, Freq] : enumerate(RegionFreq)) {
+      dbgs() << REMAT_PREFIX << "  [" << I << "] ";
+      if (Freq)
+        dbgs() << Freq;
+      else
+        dbgs() << "unknown ";
+      dbgs() << " | " << *DAG.Regions[I].first;
+    }
+  });
+
+  if (!collectRematRegs(RegionFreq)) {
+    REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
     return false;
+  }
+
+  REMAT_DEBUG({
+    dbgs() << "Rematerializable registers:\n";
+    for (const RematReg &Remat : RematRegs)
+      Remat.print(MIRegion);
+  });
+
+  // Start by rematerializing always beneficial registers. These should never
+  // be rolled back. All other rematerialization candidates get added to the
+  // list of rematerializations that will be scored.
+  REMAT_DEBUG(dbgs() << "==== ALWAYS BENEFICIAL ====\n");
+  SmallVector<ScoredRemat> ScoredRemats;
+  BitVector RecomputeRP(NumRegions);
+  for (const RematReg &Remat : RematRegs) {
+    if (Remat.isAlwaysBeneficial()) {
+      REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI]
+                         << "] REMAT (always) | " << *Remat.DefMI);
+      rematerialize(Remat, RecomputeRP);
+    } else {
+      ScoredRemats.emplace_back(&Remat, DAG.ST, *DAG.TII);
+    }
+  }
+  unsetSatisifedRPTargets(RescheduleRegions);
+
+  LLVM_DEBUG(printTargetRegions());
+#ifndef NDEBUG
+  unsigned RoundNum = 0;
+#endif
+
+  // Rematerialize registers in successive rounds until all RP targets are
+  // satisfied or until we run out of rematerialization candidates.
+  while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
+         !ScoredRemats.empty()) {
+    // (Re-)Score and (re-)sort all remats in increasing score order.
+    for (ScoredRemat &Remat : ScoredRemats)
+      Remat.update(TargetRegions, RPTargets, RegionFreq, !TargetOcc);
+    sort(ScoredRemats);
+
+    REMAT_DEBUG({
+      dbgs() << "==== ROUND " << RoundNum << " ====\n";
+      for (const ScoredRemat &SRemat : ScoredRemats) {
+        dbgs() << REMAT_PREFIX << "*" << SRemat.getScore() << "* | "
+               << *SRemat.Remat->DefMI;
+      }
+    });
+
+    RecomputeRP.reset();
+    int RematIdx = ScoredRemats.size() - 1;
+
+    // Rematerialize registers in decreasing score order until we estimate that
+    // all RP targets are satisfied or until rematerialization candidates are no
+    // longer useful to decrease RP.
+    for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
+      const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
+      int Score = ScoredRemats[RematIdx].getScore();
+
+      // Stop when scores become negative. Since scores monotonically decrease
+      // as remats are performed, we know there is nothing useful left to do in
+      // such cases.
+      if (Score <= 0) {
+        REMAT_DEBUG(dbgs() << "Stop remats on non-positive score | "
+                           << *Remat.DefMI);
+        RematIdx = -1;
+        break;
+      }
+
+      // When previous rematerializations in this round have already satisfied
+      // RP targets in all regions this rematerialization can impact, we have a
+      // good indication that our scores have diverged significantly from
+      // reality, in which case we interrupt this round and re-score. This also
+      // ensures that every rematerialization we perform is possibly impactful
+      // in at least one target region.
+      if (!Remat.intersectWithTarget(TargetRegions)) {
+        REMAT_DEBUG(dbgs() << "Stop round on stale score | " << *Remat.DefMI);
+        break;
+      }
+
+      REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI] << "] REMAT *" << Score
+                         << "* | " << *Remat.DefMI);
+      MachineInstr *RematMI = rematerialize(Remat, RecomputeRP);
+      // Every rematerialization done with the objective of increasing occupancy
+      // increases latency. If we don't manage to increase occupancy, we want to
----------------
lucas-rami wrote:

Thanks for the catch, clarified the comment.

https://github.com/llvm/llvm-project/pull/153092


More information about the llvm-commits mailing list