[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 30 07:35:15 PDT 2025
================
@@ -1089,33 +1100,224 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
#define REMAT_PREFIX "[PreRARemat] "
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void PreRARematStage::printTargetRegions(bool PrintAll) const {
+ if (PrintAll) {
+ for (auto [I, Target] : enumerate(RPTargets))
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << Target << '\n';
+ return;
+ }
+ if (TargetRegions.none()) {
+ dbgs() << REMAT_PREFIX << "No target regions\n";
+ return;
+ }
+ dbgs() << REMAT_PREFIX << "Target regions:\n";
+ for (unsigned I : TargetRegions.set_bits())
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
+}
+
+void PreRARematStage::RematReg::print(
+ const DenseMap<MachineInstr *, unsigned> &MIRegion) const {
+ dbgs() << REMAT_PREFIX << " [" << MIRegion.at(DefMI) << "] " << *DefMI;
+ dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
+ const unsigned NumRegions = Live.size();
+ dbgs() << REMAT_PREFIX << " Guaranteed RP reduction in:";
+ for (unsigned I = 0; I < NumRegions; ++I) {
+ if (isBeneficialRegion(I))
+ dbgs() << " [" << I << "]";
+ }
+ dbgs() << '\n';
+ dbgs() << REMAT_PREFIX << " Possible RP reduction in:";
+ for (unsigned I = 0; I < NumRegions; ++I) {
+ if (isMaybeBeneficialRegion(I))
+ dbgs() << " [" << I << "]";
+ }
+ dbgs() << '\n';
+}
+
+#endif
+
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions in between the defs and the region we sank the def to. Will need to
// be fixed if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+ if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
return false;
// Before performing any IR modification, record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
- RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
- RegionBB.push_back(Region.first->getParent());
+ MachineBasicBlock *ParentMBB = Region.first->getParent();
+ if (Region.second != ParentMBB->end())
+ MIRegion.insert({&*Region.second, I});
+ RegionBB.push_back(ParentMBB);
}
- if (!canIncreaseOccupancyOrReduceSpill())
+ setObjective();
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ if (TargetRegions.none()) {
+ dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU() << '\n';
+ } else if (TargetOcc) {
+ dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+ } else {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ")\n";
+ }
+ printTargetRegions(/*PrintAll=*/TargetRegions.none());
+ });
+
+ // Compute region frequencies. 0 encodes an unknown region frequency.
+ SmallVector<uint64_t> RegionFreq;
+ RegionFreq.reserve(NumRegions);
+ assert(DAG.MLI && "MLI not defined in DAG");
+ MachineBranchProbabilityInfo MBPI;
+ MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
+ uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+ if (EntryFreq) {
+ for (const MachineBasicBlock *MBB : RegionBB)
+ RegionFreq.push_back(MBFI.getBlockFreq(MBB).getFrequency() / EntryFreq);
+ } else {
+ RegionFreq.insert(RegionFreq.end(), RegionBB.size(), 0);
+ }
+ REMAT_DEBUG({
+ dbgs() << "Region frequencies:\n";
+ for (auto [I, Freq] : enumerate(RegionFreq)) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] ";
+ if (Freq)
+ dbgs() << Freq;
+ else
+ dbgs() << "unknown ";
+ dbgs() << " | " << *DAG.Regions[I].first;
+ }
+ });
+
+ if (!collectRematRegs(RegionFreq)) {
+ REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
return false;
+ }
+
+ REMAT_DEBUG({
+ dbgs() << "Rematerializable registers:\n";
+ for (const RematReg &Remat : RematRegs)
+ Remat.print(MIRegion);
+ });
+
+ // Start by rematerializing always-beneficial registers. These should never
+ // be rolled back. All other rematerialization candidates get added to the
+ // list of rematerializations that will be scored.
+ REMAT_DEBUG(dbgs() << "==== ALWAYS BENEFICIAL ====\n");
+ SmallVector<ScoredRemat> ScoredRemats;
+ BitVector RecomputeRP(NumRegions);
+ for (const RematReg &Remat : RematRegs) {
+ if (Remat.isAlwaysBeneficial()) {
+ REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI]
+ << "] REMAT (always) | " << *Remat.DefMI);
+ rematerialize(Remat, RecomputeRP);
+ } else {
+ ScoredRemats.emplace_back(&Remat, DAG.ST, *DAG.TII);
+ }
+ }
+ unsetSatisifedRPTargets(RescheduleRegions);
+
+ LLVM_DEBUG(printTargetRegions());
+#ifndef NDEBUG
+ unsigned RoundNum = 0;
+#endif
+
+ // Rematerialize registers in successive rounds until all RP targets are
+ // satisfied or until we run out of rematerialization candidates.
+ while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
+ !ScoredRemats.empty()) {
+ // (Re-)Score and (re-)sort all remats in increasing score order.
+ for (ScoredRemat &Remat : ScoredRemats)
+ Remat.update(TargetRegions, RPTargets, RegionFreq, !TargetOcc);
+ sort(ScoredRemats);
+
+ REMAT_DEBUG({
+ dbgs() << "==== ROUND " << RoundNum << " ====\n";
+ for (const ScoredRemat &SRemat : ScoredRemats) {
+ dbgs() << REMAT_PREFIX << "*" << SRemat.getScore() << "* | "
+ << *SRemat.Remat->DefMI;
+ }
+ });
+
+ RecomputeRP.reset();
+ int RematIdx = ScoredRemats.size() - 1;
+
+ // Rematerialize registers in decreasing score order until we estimate that
+ // all RP targets are satisfied or until rematerialization candidates are no
+ // longer useful to decrease RP.
+ for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
+ const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
+ int Score = ScoredRemats[RematIdx].getScore();
+
+ // Stop when scores become non-positive. Since scores monotonically
+ // decrease as remats are performed, we know there is nothing useful left
+ // to do in such cases.
+ if (Score <= 0) {
+ REMAT_DEBUG(dbgs() << "Stop remats on non-positive score | "
+ << *Remat.DefMI);
+ RematIdx = -1;
+ break;
+ }
+
+ // When previous rematerializations in this round have already satisfied
+ // RP targets in all regions this rematerialization can impact, we have a
+ // good indication that our scores have diverged significantly from
+ // reality, in which case we interrupt this round and re-score. This also
+ // ensures that every rematerialization we perform is possibly impactful
+ // in at least one target region.
+ if (!Remat.intersectWithTarget(TargetRegions)) {
+ REMAT_DEBUG(dbgs() << "Stop round on stale score | " << *Remat.DefMI);
+ break;
+ }
+
+ REMAT_DEBUG(dbgs() << "[" << MIRegion[Remat.DefMI] << "] REMAT *" << Score
+ << "* | " << *Remat.DefMI);
+ MachineInstr *RematMI = rematerialize(Remat, RecomputeRP);
+ // Every rematerialization done with the objective of increasing occupancy
+ // increases latency. If we don't manage to increase occupancy, we want to
----------------
lucas-rami wrote:
Thanks for the catch, clarified the comment.
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits
mailing list