[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 11 22:51:32 PDT 2025
================
@@ -1817,106 +1989,146 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
if (!allUsesAvailableAt(&DefMI, DefIdx, UseIdx))
continue;
- REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
- RematInstruction &Remat =
- Rematerializations.try_emplace(&DefMI, UseMI).first->second;
-
- bool RematUseful = false;
- if (auto It = OptRegions.find(I); It != OptRegions.end()) {
- // Optimistically consider that moving the instruction out of its
- // defining region will reduce RP in the latter; this assumes that
- // maximum RP in the region is reached somewhere between the defining
- // instruction and the end of the region.
- REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
- LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
- return true;
- }
-
- for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
- // We are only collecting regions in which the register is a live-in
- // (and may be live-through).
- auto It = DAG.LiveIns[LIRegion].find(Reg);
- if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
- continue;
- Remat.LiveInRegions.insert(LIRegion);
-
- // Account for the reduction in RP due to the rematerialization in an
- // optimizable region in which the defined register is a live-in. This
- // is exact for live-through region but optimistic in the using region,
- // where RP is actually reduced only if maximum RP is reached somewhere
- // between the beginning of the region and the rematerializable
- // instruction's use.
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
- REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
- if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
- RematUseful))
- return true;
- }
- }
-
- // If the instruction is not a live-in or live-out in any optimizable
- // region then there is no point in rematerializing it.
- if (!RematUseful) {
- Rematerializations.pop_back();
- REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
- } else {
- RematRegs.insert(Reg);
- }
+ // Add the instruction to the rematerializable list.
+ RematRegSet.insert(Reg);
+ RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion, RegionFreq);
}
}
- if (TargetOcc) {
- // We were trying to increase occupancy but failed, abort the stage.
- REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
- Rematerializations.clear();
- return false;
- }
- REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
- return !Rematerializations.empty();
+ return !RematRegs.empty();
}
-void PreRARematStage::rematerialize() {
- const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+PreRARematStage::RematReg::RematReg(
+ MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
+ const DenseMap<MachineInstr *, unsigned> &MIRegion,
+ ArrayRef<uint64_t> RegionFreq)
+ : DefMI(DefMI), UseMI(UseMI), UseRegion(MIRegion.at(UseMI)),
+ LiveIn(DAG.Regions.size()), LiveOut(DAG.Regions.size()),
+ Live(DAG.Regions.size()), DefFrequency(RegionFreq[MIRegion.at(DefMI)]),
+ UseFrequency(RegionFreq[MIRegion.at(UseMI)]) {
- // Collect regions whose RP changes in unpredictable way; we will have to
- // fully recompute their RP after all rematerailizations.
- DenseSet<unsigned> RecomputeRP;
-
- // Rematerialize all instructions.
- for (auto &[DefMI, Remat] : Rematerializations) {
- MachineBasicBlock::iterator InsertPos(Remat.UseMI);
- Register Reg = DefMI->getOperand(0).getReg();
- unsigned DefRegion = MIRegion.at(DefMI);
-
- // Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
- Remat.RematMI = &*std::prev(InsertPos);
- DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
-
- // Update region boundaries in regions we sinked from (remove defining MI)
- // and to (insert MI rematerialized in use block). Only then we can erase
- // the original MI.
- DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
- auto UseRegion = MIRegion.find(Remat.UseMI);
- if (UseRegion != MIRegion.end()) {
- DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
- Remat.RematMI);
+ // Mark regions in which the rematerializable register is live.
+ Register Reg = DefMI->getOperand(0).getReg();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto LiveInIt = DAG.LiveIns[I].find(Reg);
+ if (LiveInIt != DAG.LiveIns[I].end() && LiveInIt->second.any())
+ LiveIn.set(I);
+ auto LiveOutIt = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).find(Reg);
+ auto LiveOutEnd = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).end();
+ if (LiveOutIt != LiveOutEnd && LiveOutIt->second.any())
+ LiveOut.set(I);
+ }
+ Live |= LiveIn;
+ Live |= LiveOut;
+
+ // Store the register's lane bitmask.
+ unsigned SubReg = DefMI->getOperand(0).getSubReg();
+ Mask = SubReg ? DAG.TRI->getSubRegIndexLaneMask(SubReg)
+ : DAG.MRI.getMaxLaneMaskForVReg(Reg);
+}
+
+MachineInstr *
+PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
+ MachineBasicBlock::iterator InsertPos,
+ GCNScheduleDAGMILive &DAG) const {
+ MachineInstr *NewMI = &*std::prev(InsertPos);
+ DAG.updateRegionBoundaries(DAG.Regions[RegionIdx], InsertPos, NewMI);
+ DAG.LIS->InsertMachineInstrInMaps(*NewMI);
+ DAG.LIS->createAndComputeVirtRegInterval(NewMI->getOperand(0).getReg());
+ return NewMI;
+}
+
+PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
+ const GCNSubtarget &ST,
+ const TargetInstrInfo &TII)
+ : Remat(Remat) {
+ const InstrItineraryData *Itin = ST.getInstrItineraryData();
+ if (Remat->DefFrequency && Remat->UseFrequency) {
+ InstrLatencyGain = Remat->DefFrequency - Remat->UseFrequency;
+ *InstrLatencyGain *= TII.getInstrLatency(Itin, *Remat->DefMI);
+ }
+ resetScore();
+}
+
+void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
+ ArrayRef<GCNRPTarget> RPTargets,
+ ArrayRef<uint64_t> RegionFreq,
+ bool ReduceSpill) {
+ // Exit early if no target region intersects with the register's live
+ // regions.
+ if (!Remat->intersectWithTarget(TargetRegions))
+ return setUselessRemat();
+ resetScore();
+
+ // When the stage is trying to reduce spilling, we want to pick
+ // rematerialization candidates that will be beneficial to latency. When it is
+ // trying to increase occupancy, we are fine increasing latency to try to
+ // reduce RP.
+ // FIXME: In the increasing occupancy case, we should be able to incorporate
+ // the latency loss induced by rematerializations into the final score. It
+ // seems possible to very roughly estimate the overall kernel latency upside
+ // we get by increasing occupancy and compare it to the latency hit each wave
+ // will be subjected to.
+ if (ReduceSpill) {
+ // It may be better to let the register spill if it is defined by a very
+ // high latency instruction. Try to estimate the latency gain induced by
+ // rematerializing the register.
+ //
+ // If we don't know the rematerialization's latency gain we don't know
+ // what to compare the spill latency against. We still consider the
+ // rematerialization potentially beneficial in such cases because we don't
+ // want to miss rematerialization opportunities and rematerializing is in
+ // most cases cheaper than spilling. We still give a bonus to remats for
+ // which we are able to do the calculation.
+ if (InstrLatencyGain && *InstrLatencyGain < 0) {
+ int SpillLatencyGain = SaveCost * Remat->DefFrequency;
+ SpillLatencyGain += RestoreCost * Remat->UseFrequency;
+ if (*InstrLatencyGain + SpillLatencyGain < 0)
+ return setUselessRemat();
+ setKnownLatencyGain();
}
- DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
- DefMI->eraseFromParent();
+ }
+
+ // The estimated RP reduction is proportional to the total frequency in target
+ // regions where the register is live.
+ Register Reg = Remat->DefMI->getOperand(0).getReg();
+ unsigned RPScore = 0;
+ for (unsigned I : TargetRegions.set_bits()) {
+ unsigned Freq = std::max(RegionFreq[I], static_cast<uint64_t>(1));
+ if (Remat->isBeneficialRegion(I))
+ Score += WeightRP * RPTargets[I].isSaveBeneficial(Reg) * Freq;
+ else if (Remat->isMaybeBeneficialRegion(I))
+ Score += WeightRPMaybe * RPTargets[I].isSaveBeneficial(Reg) * Freq;
+ }
- // Collect all regions impacted by the rematerialization and update their
- // live-in/RP information.
- for (unsigned I : Remat.LiveInRegions) {
- ImpactedRegions.insert({I, DAG.Pressure[I]});
- GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+ // The estimated RP reduction is directly proportional to the size of the
+ // rematerializable register.
+ setRPScore(RPScore * SIRegisterInfo::getNumCoveredRegs(Remat->Mask));
+}
+MachineInstr *PreRARematStage::rematerialize(const RematReg &Remat,
+ BitVector &RecomputeRP) {
+ const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+ MachineInstr &DefMI = *Remat.DefMI;
+ Register Reg = DefMI.getOperand(0).getReg();
+ const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg);
+ Register NewReg = DAG.MRI.createVirtualRegister(RC);
----------------
arsenm wrote:
cloneVirtualRegister? Could this lose WWM flags?
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits
mailing list