[llvm] [AMDGPU][Scheduler] Scoring system for rematerialization candidates (PR #153092)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 15 13:23:55 PDT 2025
================
@@ -1760,106 +1931,203 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
*DAG.TII))
continue;
- REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
- RematInstruction &Remat =
- Rematerializations.try_emplace(&DefMI, UseMI).first->second;
-
- bool RematUseful = false;
- if (auto It = OptRegions.find(I); It != OptRegions.end()) {
- // Optimistically consider that moving the instruction out of its
- // defining region will reduce RP in the latter; this assumes that
- // maximum RP in the region is reached somewhere between the defining
- // instruction and the end of the region.
- REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
- LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
- if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
- return true;
- }
+ // Add the instruction to the rematerializable list.
+ RematRegSet.insert(Reg);
+ RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
+ }
+ }
- for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
- // We are only collecting regions in which the register is a live-in
- // (and may be live-through).
- auto It = DAG.LiveIns[LIRegion].find(Reg);
- if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
- continue;
- Remat.LiveInRegions.insert(LIRegion);
-
- // Account for the reduction in RP due to the rematerialization in an
- // optimizable region in which the defined register is a live-in. This
- // is exact for live-through regions but optimistic in the using region,
- // where RP is actually reduced only if maximum RP is reached somewhere
- // between the beginning of the region and the rematerializable
- // instruction's use.
- if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
- REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
- if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
- RematUseful))
- return true;
- }
- }
+ return !RematRegs.empty();
+}
- // If the instruction is not a live-in or live-out in any optimizable
- // region then there is no point in rematerializing it.
- if (!RematUseful) {
- Rematerializations.pop_back();
- REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
- } else {
- RematRegs.insert(Reg);
- }
- }
+PreRARematStage::RematReg::RematReg(
+ MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
+ const DenseMap<MachineInstr *, unsigned> &MIRegion)
+ : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
+ LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
+ DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
+
+ // Mark regions in which the rematerializable register is live.
+ Register Reg = getReg();
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto LiveInIt = DAG.LiveIns[I].find(Reg);
+ if (LiveInIt != DAG.LiveIns[I].end() && LiveInIt->second.any())
+ LiveIn.set(I);
+ auto LiveOutIt = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).find(Reg);
+ auto LiveOutEnd = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).end();
+ if (LiveOutIt != LiveOutEnd && LiveOutIt->second.any())
+ LiveOut.set(I);
}
+ Live |= LiveIn;
+ Live |= LiveOut;
- if (TargetOcc) {
- // We were trying to increase occupancy but failed, abort the stage.
- REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
- Rematerializations.clear();
- return false;
+ // Store the register's lane bitmask.
+ unsigned SubIdx = DefMI->getOperand(0).getSubReg();
+ Mask = SubIdx ? DAG.TRI->getSubRegIndexLaneMask(SubIdx)
+ : DAG.MRI.getMaxLaneMaskForVReg(Reg);
+}
+
+bool PreRARematStage::RematReg::maybeBeneficial(
+ const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
+ Register Reg = getReg();
+ for (unsigned I : TargetRegions.set_bits()) {
+ if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
+ return true;
}
- REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
- return !Rematerializations.empty();
+ return false;
}
-void PreRARematStage::rematerialize() {
- const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
+ MachineInstr *RematMI,
+ GCNScheduleDAGMILive &DAG) const {
+ RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
+ if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
+ Bounds.first = RematMI;
+ DAG.LIS->InsertMachineInstrInMaps(*RematMI);
+ DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
+}
- // Collect regions whose RP changes in an unpredictable way; we will have
- // to fully recompute their RP after all rematerializations.
- DenseSet<unsigned> RecomputeRP;
-
- // Rematerialize all instructions.
- for (auto &[DefMI, Remat] : Rematerializations) {
- MachineBasicBlock::iterator InsertPos(Remat.UseMI);
- Register Reg = DefMI->getOperand(0).getReg();
- unsigned DefRegion = MIRegion.at(DefMI);
-
- // Rematerialize DefMI to its use block.
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
- Remat.RematMI = &*std::prev(InsertPos);
- DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
-
- // Update region boundaries in regions we sank from (remove defining MI)
- // and to (insert MI rematerialized in use block). Only then we can erase
- // the original MI.
- DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
- auto UseRegion = MIRegion.find(Remat.UseMI);
- if (UseRegion != MIRegion.end()) {
- DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
- Remat.RematMI);
- }
- DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
- DefMI->eraseFromParent();
+PreRARematStage::ScoredRemat::FreqInfo::FreqInfo(
+ MachineFunction &MF, const GCNScheduleDAGMILive &DAG) {
+ assert(DAG.MLI && "MLI not defined in DAG");
+ MachineBranchProbabilityInfo MBPI;
+ MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
+
+ const unsigned NumRegions = DAG.Regions.size();
+ uint64_t MinFreq = MBFI.getEntryFreq().getFrequency();
+ Regions.reserve(NumRegions);
+ for (unsigned I = 0; I < NumRegions; ++I) {
+ MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
+ uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
+ Regions.push_back(BlockFreq);
+ if (BlockFreq && BlockFreq < MinFreq)
+ MinFreq = BlockFreq;
+ else if (BlockFreq > MaxFreq)
+ MaxFreq = BlockFreq;
+ }
+ if (!MinFreq)
+ return;
+
+ // Normalize to minimum observed frequency to avoid overflows when adding up
+ // frequencies.
+ for (uint64_t &Freq : Regions)
+ Freq /= MinFreq;
+ MaxFreq /= MinFreq;
+
+ // Compute the scaling factor for scoring frequency differences.
+ const uint64_t MaxDiff = MaxFreq - 1;
+ const uint64_t MaxReprFreqValue = (1 << FreqDiffWidth) - 1;
+ RescaleIsDenom = (2 * MaxDiff) & ~MaxReprFreqValue;
+ if (RescaleIsDenom)
+ RescaleFactor = (2 * MaxDiff) >> FreqDiffWidth;
+ else
+ RescaleFactor = MaxDiff ? MaxReprFreqValue / (2 * MaxDiff) : 1;
+}
+
+PreRARematStage::ScoredRemat::ScoredRemat(const RematReg *Remat,
+ const FreqInfo &Freq,
+ const GCNScheduleDAGMILive &DAG)
+ : Remat(Remat), NumRegs(getNumRegs(DAG)), FreqDiff(getFreqDiff(Freq)) {}
+
+unsigned PreRARematStage::ScoredRemat::getNumRegs(
+ const GCNScheduleDAGMILive &DAG) const {
+ const TargetRegisterClass &RC = *DAG.MRI.getRegClass(Remat->getReg());
+ unsigned RegSize = DAG.TRI->getRegSizeInBits(RC);
+ if (unsigned SubIdx = Remat->DefMI->getOperand(0).getSubReg()) {
+ // The following may return -1 (i.e., a large unsigned number) for indices
+ // that can be used to access subregisters of multiple sizes; in such cases
+ // fall back on the size derived from the register class.
+ unsigned SubRegSize = DAG.TRI->getSubRegIdxSize(SubIdx);
+ if (SubRegSize < RegSize)
+ RegSize = SubRegSize;
+ }
+ return divideCeil(RegSize, 32);
+}
+
+uint64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
+ // Get the frequencies of the defining and using regions. A rematerialization
+ // from the least frequent region to the most frequent region yields the
+ // greatest latency penalty and therefore should get the minimum score.
+ // Conversely, a rematerialization in the other direction should get the
+ // maximum score. Default to values that yield the worst possible score given
+ // known frequencies, in order to penalize rematerializations from or into
+ // regions whose frequency is unknown.
+ uint64_t DefOrOne = std::max(Freq.Regions[Remat->DefRegion], (uint64_t)1);
+ uint64_t UseOrMax = Freq.Regions[Remat->UseRegion];
+ if (!UseOrMax)
+ UseOrMax = Freq.MaxFreq;
+
+ // Maximum difference in frequency between defining and using regions.
+ const uint64_t MaxDiff = Freq.MaxFreq - 1;
+ // The difference between the defining and using frequencies is in the range
+ // [-MaxDiff, MaxDiff]; shift it to [0, 2 x MaxDiff] to stay in the positive
+ // range, then rescale it to the representable range in the final score.
+ const uint64_t FreqDiff = (MaxDiff + (DefOrOne - UseOrMax));
----------------
jrbyrnes wrote:
Can you add unit tests or provide some more info about how this FreqDiff scaling is supposed to work? It's not very clear just reading through it.
Alternatively, I think you could simplify the `ScoredRemat::score` handling to use a struct or something instead of the complex bit-packing algorithm, which would allow a more direct calculation of the relative frequency.
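For reference, here's my current reading of the scaling as a standalone sketch. To be clear, this is not the PR's code: `FreqDiffWidth`, the region frequencies, and the `main` wrapper are all made up for illustration (and the hunk is cut off before `RescaleFactor` is applied, so that last step is a guess too), so please correct me if I'm misreading the intent.

```cpp
// Sketch of the normalize / shift / rescale pipeline as I understand it from
// the quoted diff. All values and names below are illustrative assumptions.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  constexpr unsigned FreqDiffWidth = 4;               // assumed score field width
  std::vector<uint64_t> Regions = {1000, 2000, 8000}; // assumed block frequencies
  uint64_t MinFreq = 1000, MaxFreq = 8000;

  // Step 1: normalize to the minimum observed frequency so that later sums of
  // frequencies cannot overflow. Regions become {1, 2, 8}, MaxFreq becomes 8.
  for (uint64_t &F : Regions)
    F /= MinFreq;
  MaxFreq /= MinFreq;

  // Step 2: a def/use frequency difference lies in [-MaxDiff, MaxDiff] with
  // MaxDiff = MaxFreq - 1 = 7; adding MaxDiff shifts it into [0, 2*MaxDiff].
  const uint64_t MaxDiff = MaxFreq - 1;
  const uint64_t MaxRepr = (1 << FreqDiffWidth) - 1; // 15

  // Step 3: decide whether the rescale factor divides or multiplies so that
  // the shifted difference fits in FreqDiffWidth bits. Here 2*MaxDiff = 14
  // already fits in [0, 15], so the factor is a multiplier: 15 / 14 = 1.
  const bool RescaleIsDenom = ((2 * MaxDiff) & ~MaxRepr) != 0;
  const uint64_t RescaleFactor = RescaleIsDenom
                                     ? (2 * MaxDiff) >> FreqDiffWidth
                                     : MaxRepr / (2 * MaxDiff);

  // Worst case for rematerialization: def in the coldest region (freq 1),
  // use in the hottest (freq 8). The shifted difference is 7 + (1 - 8) = 0,
  // which rescales to the minimum score; the opposite direction scores 14.
  const uint64_t FreqDiff = MaxDiff + (Regions[0] - Regions[2]);
  const uint64_t Score =
      RescaleIsDenom ? FreqDiff / RescaleFactor : FreqDiff * RescaleFactor;
  assert(Score <= MaxRepr);
  (void)Score;
  return 0;
}
```

If that reading is right, even a comment spelling out the three steps (normalize, shift, rescale) plus a unit test pinning the minimum and maximum scores would address most of my concern.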
https://github.com/llvm/llvm-project/pull/153092
More information about the llvm-commits mailing list