[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 13:56:54 PDT 2025
================
@@ -1090,33 +1101,223 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
// Textual tag written in front of the debug output produced through
// REMAT_DEBUG and by the print helpers that stream it explicitly.
#define REMAT_PREFIX "[PreRARemat] "
// Wraps X in LLVM_DEBUG: first streams REMAT_PREFIX to dbgs(), then executes
// the statement(s) X.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void PreRARematStage::printTargetRegions(bool PrintAll) const {
+  // Debug dump of RP targets to dbgs(), one "[index] target" line per region.
+  // With PrintAll set, every entry of RPTargets is listed; otherwise only the
+  // regions whose bit is set in TargetRegions are listed.
+  if (!PrintAll) {
+    if (TargetRegions.any()) {
+      dbgs() << REMAT_PREFIX << "Target regions:\n";
+      for (unsigned RegionIdx : TargetRegions.set_bits()) {
+        dbgs() << REMAT_PREFIX << " [" << RegionIdx << "] "
+               << RPTargets[RegionIdx] << '\n';
+      }
+    } else {
+      // Nothing is flagged: say so explicitly rather than printing nothing.
+      dbgs() << REMAT_PREFIX << "No target regions\n";
+    }
+    return;
+  }
+
+  for (auto [RegionIdx, RPTarget] : enumerate(RPTargets))
+    dbgs() << REMAT_PREFIX << " [" << RegionIdx << "] " << RPTarget << '\n';
+}
+
+void PreRARematStage::RematReg::print() const {
+  // Writes Header followed by the index of every region set in Live whose
+  // isUnusedLiveThrough() answer equals WantUnused, then ends the line.
+  const auto PrintLiveRegions = [this](const char *Header, bool WantUnused) {
+    dbgs() << REMAT_PREFIX << Header;
+    for (unsigned RegionIdx : Live.set_bits()) {
+      if (isUnusedLiveThrough(RegionIdx) == WantUnused)
+        dbgs() << " [" << RegionIdx << "]";
+    }
+    dbgs() << '\n';
+  };
+
+  // Defining instruction and its region, then the using instruction and its
+  // region.
+  dbgs() << REMAT_PREFIX << " [" << DefRegion << "] " << *DefMI;
+  dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
+
+  // Regions where the register is an unused live-through are reported as a
+  // guaranteed reduction; all other live regions as a possible one.
+  PrintLiveRegions(" Guaranteed RP reduction in:", /*WantUnused=*/true);
+  PrintLiveRegions(" Possible RP reduction in:", /*WantUnused=*/false);
+}
+
+void PreRARematStage::ScoredRemat::print() const {
+  // Prints the score as "(MaxFreq, FreqDiff, RegionImpact)" to dbgs().
+  //
+  // Score is unpacked as a bit field: the low RegionImpactWidth bits hold the
+  // region impact, the next FreqDiffWidth bits hold the frequency difference,
+  // and all remaining high bits hold the maximum frequency.
+  //
+  // The masks are built from ScoreTy(1) rather than the int literal 1 so that
+  // the shift is carried out in the score's own type; shifting a plain int
+  // overflows once a field is 31 bits wide or more.
+  // NOTE(review): assumes ScoreTy is an integer type at least as wide as the
+  // packed fields -- its definition is not visible here; please confirm.
+  const ScoreTy RegionImpactMask = (ScoreTy(1) << RegionImpactWidth) - 1;
+  const ScoreTy FreqDiffMask = (ScoreTy(1) << FreqDiffWidth) - 1;
+
+  ScoreTy Remaining = Score;
+  const ScoreTy RegionImpact = Remaining & RegionImpactMask;
+  Remaining >>= RegionImpactWidth;
+  const ScoreTy FreqDiff = Remaining & FreqDiffMask;
+  Remaining >>= FreqDiffWidth;
+  const ScoreTy MaxFreq = Remaining;
+  dbgs() << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
+}
+#endif
+
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions inbetween the defs and region we sinked the def to. Will need to be
// fixed if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+ if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
return false;
+ // Maps all MIs (except lone terminators, which are not part of any region) to
+ // their parent region. Non-lone terminators are considered part of the region
+ // they delimitate.
+ DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
+
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
- RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
- RegionBB.push_back(Region.first->getParent());
+ MachineBasicBlock *ParentMBB = Region.first->getParent();
+ if (Region.second != ParentMBB->end())
+ MIRegion.insert({&*Region.second, I});
+ RegionBB.push_back(ParentMBB);
}
- if (!canIncreaseOccupancyOrReduceSpill())
+ // Set an objective for the stage based on current RP in each region.
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ });
+ if (!setObjective()) {
+ LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU() << '\n');
+ return false;
+ }
+ LLVM_DEBUG({
+ if (TargetOcc) {
+ dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+ } else {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ")\n";
+ }
+ printTargetRegions(/*PrintAll=*/TargetRegions.none());
+ });
+
+ if (!collectRematRegs(MIRegion)) {
+ REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
+ return false;
+ }
+ REMAT_DEBUG({
+ dbgs() << "Rematerializable registers:\n";
+ for (const RematReg &Remat : RematRegs)
+ Remat.print();
+ });
+
+ const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
+ REMAT_DEBUG({
+ dbgs() << "Region frequencies\n";
+ for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] ";
+ if (Freq)
+ dbgs() << Freq;
+ else
+ dbgs() << "unknown ";
+ dbgs() << " | " << *DAG.Regions[I].first;
+ }
+ });
+
+ SmallVector<ScoredRemat> ScoredRemats;
+ for (const RematReg &Remat : RematRegs)
+ ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
+
+// Rematerialize registers in successive rounds until all RP targets are
+// satisfied or until we run out of rematerialization candidates.
+#ifndef NDEBUG
+ unsigned RoundNum = 0;
+#endif
+ BitVector RecomputeRP(NumRegions);
+ do {
+ // (Re-)Score and (re-)sort all remats in increasing score order.
+ for (ScoredRemat &Remat : ScoredRemats)
+ Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
+ sort(ScoredRemats);
+
+ REMAT_DEBUG({
+ dbgs() << "==== ROUND " << RoundNum << " ====\n";
+ for (const ScoredRemat &SRemat : ScoredRemats) {
+ dbgs() << REMAT_PREFIX;
+ SRemat.print();
+ dbgs() << " | " << *SRemat.Remat->DefMI;
+ }
+ printTargetRegions();
+ });
+
+ RecomputeRP.reset();
+ int RematIdx = ScoredRemats.size() - 1;
+
+ // Rematerialize registers in decreasing score order until we estimate
+ // that all RP targets are satisfied or until rematerialization candidates
+ // are no longer useful to decrease RP.
+ for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
+ const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
+ // Stop on null score. Since scores monotonically decrease as we
+ // rematerialize, we know there is nothing useful left to do in such
+ // cases.
+ if (ScoredRemats[RematIdx].hasNullScore()) {
+ REMAT_DEBUG(dbgs() << "*** Stop on null score | " << *Remat.DefMI);
+ RematIdx = -1;
+ break;
+ }
+
+ // When previous rematerializations in this round have already satisfied
+ // RP targets in all regions this rematerialization can impact, we have a
+ // good indication that our scores have diverged significantly from
+ // reality, in which case we interrupt this round and re-score. This also
+ // ensures that every rematerialization we perform is possibly impactful
+ // in at least one target region.
+ if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) {
+ REMAT_DEBUG(dbgs() << "*** Stop round on stale score | "
+ << *Remat.DefMI);
+ break;
+ }
+
+ REMAT_DEBUG(dbgs() << "*** REMAT [" << Remat.DefRegion << " -> "
+ << Remat.UseRegion << "] | " << *Remat.DefMI);
+ // Every rematerialization we do here is likely to move the instruction
+ // into a higher frequency region, increasing the total sum latency of the
+ // instruction itself. This is acceptable if we are eliminating a spill in
+ // the process, but when the goal is increasing occupancy we get nothing
+ // out of rematerialization if occupancy is not increased in the end; in
+ // such cases we want to roll back the rematerialization.
+ RollbackInfo *Rollback =
+ TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr;
+ rematerialize(Remat, RecomputeRP, Rollback);
+ unsetSatisifedRPTargets(Remat.Live);
+ }
+
+#ifndef NDEBUG
+ ++RoundNum;
+#endif
+ REMAT_DEBUG({
+ if (!TargetRegions.any())
+ dbgs() << "*** Stop round on all targets achieved\n";
+ else if (RematIdx == -1)
+ dbgs() << "*** Stop round on exhausted remat opportunities\n";
+ });
+
+ // Peel off registers we already rematerialized from the vector's tail.
+ ScoredRemats.truncate(RematIdx + 1);
+ } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
----------------
lucas-rami wrote:
I don't think so. If a remat is no longer beneficial while there are still target regions, we will check whether any region was incorrectly unmarked as a target due to our heuristics, then re-score and start a new round. Then, either there is at least one remat that is still useful (the same one or another), in which case we will be able to make progress, or the first remat we check will have a null score and we will stop rematerializing altogether.
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits
mailing list