[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 15 14:18:27 PDT 2025
================
@@ -1090,33 +1101,223 @@ bool ClusteredLowOccStage::initGCNSchedStage() {
#define REMAT_PREFIX "[PreRARemat] "
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void PreRARematStage::printTargetRegions(bool PrintAll) const {
+ if (PrintAll) {
+ for (auto [I, Target] : enumerate(RPTargets))
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << Target << '\n';
+ return;
+ }
+ if (TargetRegions.none()) {
+ dbgs() << REMAT_PREFIX << "No target regions\n";
+ return;
+ }
+ dbgs() << REMAT_PREFIX << "Target regions:\n";
+ for (unsigned I : TargetRegions.set_bits())
+ dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
+}
+
+void PreRARematStage::RematReg::print() const {
+ dbgs() << REMAT_PREFIX << " [" << DefRegion << "] " << *DefMI;
+ dbgs() << REMAT_PREFIX << " -> used in [" << UseRegion << "] " << *UseMI;
+ dbgs() << REMAT_PREFIX << " Guaranteed RP reduction in:";
+ for (unsigned I : Live.set_bits()) {
+ if (isUnusedLiveThrough(I))
+ dbgs() << " [" << I << "]";
+ }
+ dbgs() << '\n';
+ dbgs() << REMAT_PREFIX << " Possible RP reduction in:";
+ for (unsigned I : Live.set_bits()) {
+ if (!isUnusedLiveThrough(I))
+ dbgs() << " [" << I << "]";
+ }
+ dbgs() << '\n';
+}
+
+void PreRARematStage::ScoredRemat::print() const {
+ ScoreTy ShiftScore = Score;
+ ScoreTy RegionImpact = ShiftScore & ((1 << RegionImpactWidth) - 1);
+ ShiftScore >>= RegionImpactWidth;
+ ScoreTy FreqDiff = ShiftScore & ((1 << FreqDiffWidth) - 1);
+ ShiftScore >>= FreqDiffWidth;
+ ScoreTy MaxFreq = ShiftScore;
+ dbgs() << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
+}
+#endif
+
bool PreRARematStage::initGCNSchedStage() {
// FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
// regions inbetween the defs and region we sinked the def to. Will need to be
// fixed if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
+ if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
return false;
+ // Maps all MIs (except lone terminators, which are not part of any region) to
+ // their parent region. Non-lone terminators are considered part of the region
+ // they delimitate.
+ DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
+
// Before performing any IR modification record the parent region of each MI
// and the parent MBB of each region.
const unsigned NumRegions = DAG.Regions.size();
- RegionBB.reserve(NumRegions);
for (unsigned I = 0; I < NumRegions; ++I) {
RegionBoundaries Region = DAG.Regions[I];
for (auto MI = Region.first; MI != Region.second; ++MI)
MIRegion.insert({&*MI, I});
- RegionBB.push_back(Region.first->getParent());
+ MachineBasicBlock *ParentMBB = Region.first->getParent();
+ if (Region.second != ParentMBB->end())
+ MIRegion.insert({&*Region.second, I});
+ RegionBB.push_back(ParentMBB);
}
- if (!canIncreaseOccupancyOrReduceSpill())
+ // Set an objective for the stage based on current RP in each region.
+ REMAT_DEBUG({
+ dbgs() << "Analyzing ";
+ MF.getFunction().printAsOperand(dbgs(), false);
+ dbgs() << ": ";
+ });
+ if (!setObjective()) {
+ LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
+ << MFI.getMaxWavesPerEU() << '\n');
+ return false;
+ }
+ LLVM_DEBUG({
+ if (TargetOcc) {
+ dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
+ } else {
+ dbgs() << "reduce spilling (minimum target occupancy is "
+ << MFI.getMinWavesPerEU() << ")\n";
+ }
+ printTargetRegions(/*PrintAll=*/TargetRegions.none());
+ });
+
+ if (!collectRematRegs(MIRegion)) {
+ REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
+ return false;
+ }
+ REMAT_DEBUG({
+ dbgs() << "Rematerializable registers:\n";
+ for (const RematReg &Remat : RematRegs)
+ Remat.print();
+ });
+
+ const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
+ REMAT_DEBUG({
+ dbgs() << "Region frequencies\n";
+ for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
+ dbgs() << REMAT_PREFIX << " [" << I << "] ";
+ if (Freq)
+ dbgs() << Freq;
+ else
+ dbgs() << "unknown ";
+ dbgs() << " | " << *DAG.Regions[I].first;
+ }
+ });
+
+ SmallVector<ScoredRemat> ScoredRemats;
+ for (const RematReg &Remat : RematRegs)
+ ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
+
+// Rematerialize registers in successive rounds until all RP targets are
+// satisifed or until we run out of rematerialization candidates.
+#ifndef NDEBUG
+ unsigned RoundNum = 0;
+#endif
+ BitVector RecomputeRP(NumRegions);
+ do {
+ // (Re-)Score and (re-)sort all remats in increasing score order.
+ for (ScoredRemat &Remat : ScoredRemats)
+ Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
+ sort(ScoredRemats);
+
+ REMAT_DEBUG({
+ dbgs() << "==== ROUND " << RoundNum << " ====\n";
+ for (const ScoredRemat &SRemat : ScoredRemats) {
+ dbgs() << REMAT_PREFIX;
+ SRemat.print();
+ dbgs() << " | " << *SRemat.Remat->DefMI;
+ }
+ printTargetRegions();
+ });
+
+ RecomputeRP.reset();
+ int RematIdx = ScoredRemats.size() - 1;
+
+ // Rematerialize registers in decreasing score order until we estimate
+ // that all RP targets are satisfied or until rematerialization candidates
+ // are no longer useful to decrease RP.
+ for (; RematIdx >= 0 && TargetRegions.any(); --RematIdx) {
+ const RematReg &Remat = *ScoredRemats[RematIdx].Remat;
+ // Stop on null score. Since scores monotonically decrease as we
+ // rematerialize, we know there is nothing useful left to do in such
+ // cases.
+ if (ScoredRemats[RematIdx].hasNullScore()) {
+ REMAT_DEBUG(dbgs() << "*** Stop on null score | " << *Remat.DefMI);
+ RematIdx = -1;
+ break;
+ }
+
+ // When previous rematerializations in this round have already satisfied
+ // RP targets in all regions this rematerialization can impact, we have a
+ // good indication that our scores have diverged significantly from
+ // reality, in which case we interrupt this round and re-score. This also
+ // ensures that every rematerialization we perform is possibly impactful
+ // in at least one target region.
+ if (!Remat.maybeBeneficial(TargetRegions, RPTargets)) {
+ REMAT_DEBUG(dbgs() << "*** Stop round on stale score | "
+ << *Remat.DefMI);
+ break;
+ }
+
+ REMAT_DEBUG(dbgs() << "*** REMAT [" << Remat.DefRegion << " -> "
+ << Remat.UseRegion << "] | " << *Remat.DefMI);
+ // Every rematerialization we do here is likely to move the instruction
+ // into a higher frequency region, increasing the total sum latency of the
+ // instruction itself. This is acceptable if we are eliminating a spill in
+ // the process, but when the goal is increasing occupancy we get nothing
+ // out of rematerialization if occupancy is not increased in the end; in
+ // such cases we want to roll back the rematerialization.
+ RollbackInfo *Rollback =
+ TargetOcc ? &Rollbacks.emplace_back(&Remat) : nullptr;
+ rematerialize(Remat, RecomputeRP, Rollback);
+ unsetSatisifedRPTargets(Remat.Live);
+ }
+
+#ifndef NDEBUG
+ ++RoundNum;
+#endif
+ REMAT_DEBUG({
+ if (!TargetRegions.any())
+ dbgs() << "*** Stop round on all targets achieved\n";
+ else if (RematIdx == -1)
+ dbgs() << "*** Stop round on exhausted remat opportunities\n";
+ });
+
+ // Peel off registers we already rematerialized from the vector's tail.
+ ScoredRemats.truncate(RematIdx + 1);
+ } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
----------------
jrbyrnes wrote:
Is it possible to hit an infinite loop here? For example, if we have a `ScoredRemat` that is no longer beneficial but `TargetRegions` is still non-empty?
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits
mailing list