[llvm] [AMDGPU][Scheduler] Refactor VGPR rematerialization during scheduling (PR #118722)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 31 10:56:22 PST 2025
================
@@ -1467,284 +1488,460 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
-void PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
- for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
- Register Reg = Register::index2VirtReg(I);
- if (!DAG.LIS->hasInterval(Reg))
- continue;
-
- // TODO: Handle AGPR and SGPR rematerialization
- if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
- continue;
- MachineOperand *Op = DAG.MRI.getOneDef(Reg);
- MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
- continue;
-
- MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
- if (Def->getParent() == UseI->getParent())
+ RA_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+ << MF.getFunction().getName() << '\n');
+
+ // Models excess register pressure in a region.
+ struct ExcessRP {
+ // Amount of VGPR spilling, if any.
+ unsigned Spilling = 0;
+ // Number of VGPRs to save to increase occupancy. std::nullopt when
+ // occupancy is SGPR-limited.
+ std::optional<unsigned> Occupancy = std::nullopt;
+ };
+ // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+ // occupancy, or regions with VGPR spilling) to their excess RP.
+ DenseMap<unsigned, ExcessRP> OptRegions;
+
+ // Collect optimizable regions.
+ unsigned NumRegionsSpilling = 0;
+ bool CanIncreaseOccupancy = true;
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (!DAG.RegionsWithMinOcc[I])
continue;
+ GCNRegPressure &RP = DAG.Pressure[I];
+
+ unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
+ ExcessRP Excess;
+ if ((Excess.Spilling = ST.getNumVGPRsToEliminateSpilling(NumVGPRs))) {
+ // Region has VGPR spilling, we may not be able to increase occupancy but
+ // we can at least try to reduce spilling as much as possible.
+ ++NumRegionsSpilling;
+ RA_DEBUG(dbgs() << "Region " << I << " is spilling VGPRs, save "
+ << Excess.Spilling << " VGPR(s) to eliminate spilling\n");
+ }
+ if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) != DAG.MinOccupancy) {
+ // Occupancy is VGPR-limited. If occupancy is minimal and there is
+ // spilling, the number of registers to save to increase occupancy
+ // includes the number of spilled registers to save; deduct the latter
+ // from the former to only get the number of registers to save to
+ // increase occupancy as if there was no spilling.
+ Excess.Occupancy =
+ ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs) - Excess.Spilling;
+ RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: save "
+ << Excess.Occupancy << " VGPR(s) to improve occupancy\n");
+ } else {
+ CanIncreaseOccupancy = false;
+ }
+ if (Excess.Spilling || Excess.Occupancy)
+ OptRegions.insert({I, Excess});
+ }
+ if (OptRegions.empty())
+ return false;
- // We are only collecting defs that are defined in another block and are
- // live-through or used inside regions at MinOccupancy. This means that the
- // register must be in the live-in set for the region.
- bool AddedToRematList = false;
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto It = DAG.LiveIns[I].find(Reg);
- if (It != DAG.LiveIns[I].end() && !It->second.none()) {
- if (DAG.RegionsWithMinOcc[I]) {
- RematerializableInsts[I][Def] = UseI;
- AddedToRematList = true;
+ bool FuncHasSpilling = NumRegionsSpilling != 0;
+ unsigned RematSpillingCutoff = 0;
+ TargetOcc = DAG.MinOccupancy + (CanIncreaseOccupancy ? 1 : 0);
+
+ // Accounts for a reduction in RP in an optimizable region. Returns whether we
+ // estimate that we have identified enough rematerialization opportunities to
+ // increase function occupancy.
+ auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask) -> bool {
+ auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+ unsigned I = OptIt->getFirst();
+ ExcessRP &Excess = OptIt->getSecond();
+
+ // While there is spilling, saved registers only serve to reduce it.
+ if (Excess.Spilling) {
+ unsigned Reduction = std::min(NumRegs, Excess.Spilling);
+ Excess.Spilling -= Reduction;
+ NumRegs -= Reduction;
+ if (!Excess.Spilling) {
+ // We have eliminated spilling in the region.
+ LLVM_DEBUG(dbgs() << "sinking eliminates spilling in region " << I
+ << '\n');
+ if (--NumRegionsSpilling)
+ RematSpillingCutoff = Rematerializations.size();
+ if (!Excess.Occupancy) {
+ // Occupancy cannot be increased, so we are done with this region.
+ OptRegions.erase(I);
+ return OptRegions.empty();
}
-
- // Collect regions with rematerializable reg as live-in to avoid
- // searching later when updating RP.
- RematDefToLiveInRegions[Def].push_back(I);
+ } else {
+ LLVM_DEBUG(dbgs() << "sinking reduces spilling in region " << I
+ << " by " << Reduction << '\n');
}
}
- if (!AddedToRematList)
- RematDefToLiveInRegions.erase(Def);
- }
-}
-
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
- // Temporary copies of cached variables we will be modifying and replacing if
- // sinking succeeds.
- SmallVector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
- NewRegions;
- DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
- DenseMap<unsigned, GCNRegPressure> NewPressure;
- BitVector NewRescheduleRegions;
- LiveIntervals *LIS = DAG.LIS;
-
- NewRegions.resize(DAG.Regions.size());
- NewRescheduleRegions.resize(DAG.Regions.size());
-
- // Collect only regions that has a rematerializable def as a live-in.
- SmallSet<unsigned, 16> ImpactedRegions;
- for (const auto &It : RematDefToLiveInRegions)
- ImpactedRegions.insert(It.second.begin(), It.second.end());
-
- // Make copies of register pressure and live-ins cache that will be updated
- // as we rematerialize.
- for (auto Idx : ImpactedRegions) {
- NewPressure[Idx] = DAG.Pressure[Idx];
- NewLiveIns[Idx] = DAG.LiveIns[Idx];
- }
- NewRegions = DAG.Regions;
- NewRescheduleRegions.reset();
-
- DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
- bool Improved = false;
- for (auto I : ImpactedRegions) {
- if (!DAG.RegionsWithMinOcc[I])
- continue;
- Improved = false;
- int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
- int SGPRUsage = NewPressure[I].getSGPRNum();
+ // Once spilling has been eliminated, saved registers serve to increase
+ // occupancy.
+ if (NumRegs && Excess.Occupancy) {
+ if (NumRegs >= *Excess.Occupancy) {
+ OptRegions.erase(I);
+ LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+ << '\n');
+ } else {
+ *Excess.Occupancy -= NumRegs;
+ LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+ << " by " << NumRegs << '\n');
+ }
+ }
+ return OptRegions.empty();
+ };
+
+ // We need up-to-date live-out info. to query live-out register masks in
+ // regions containing rematerializable instructions.
+ DAG.RegionLiveOuts.buildLiveRegMap();
+
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto Region = DAG.Regions[I];
+ for (auto MI = Region.first; MI != Region.second; ++MI) {
+ // The instruction must be trivially rematerializable.
+ MachineInstr &DefMI = *MI;
+ if (!isTriviallyReMaterializable(DefMI))
+ continue;
- // TODO: Handle occupancy drop due to AGPR and SGPR.
- // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
- if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
- break;
+ // We only support rematerializing virtual VGPRs with one definition.
+ Register Reg = DefMI.getOperand(0).getReg();
+ if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+ !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+ !DAG.MRI.hasOneDef(Reg))
+ continue;
- // The occupancy of this region could have been improved by a previous
- // iteration's sinking of defs.
- if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
- NewRescheduleRegions[I] = true;
- Improved = true;
- continue;
- }
+ // We only care to rematerialize the instruction if it has a single
+ // non-debug user in a different block.
+ MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
+ if (!UseMI || DefMI.getParent() == UseMI->getParent())
+ continue;
- // First check if we have enough trivially rematerializable instructions to
- // improve occupancy. Optimistically assume all instructions we are able to
- // sink decreased RP.
- int TotalSinkableRegs = 0;
- for (const auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- Register DefReg = Def->getOperand(0).getReg();
- TotalSinkableRegs +=
- SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
- }
- int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
- unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
- // If in the most optimistic scenario, we cannot improve occupancy, then do
- // not attempt to sink any instructions.
- if (OptimisticOccupancy <= DAG.MinOccupancy)
- break;
+ RA_DEBUG(dbgs() << "In region " << I << ", rematerializable instruction ";
+ DefMI.print(dbgs(), true, false, false, false);
+ dbgs() << " with single use " << *UseMI);
+ auto &Remat = Rematerializations.emplace_back(&DefMI, I, UseMI);
+
+ bool RematUseful = false;
+ if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+ // Optimistically consider that moving the instruction out of its
+ // defining region will reduce RP in the latter; this assumes that
+ // maximum RP in the region is reached somewhere between the defining
+ // instruction and the end of the region.
+ RA_DEBUG(dbgs() << " Instruction's defining region is optimizable: ");
+ RematUseful = true;
+ LaneBitmask RegMask =
+ DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+ if (ReduceRPInRegion(It, RegMask))
+ return true;
+ }
- unsigned ImproveOccupancy = 0;
- SmallVector<MachineInstr *, 4> SinkedDefs;
- for (auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- MachineBasicBlock::iterator InsertPos =
- MachineBasicBlock::iterator(It.second);
- Register Reg = Def->getOperand(0).getReg();
- // Rematerialize MI to its use block. Since we are only rematerializing
- // instructions that do not have any virtual reg uses, we do not need to
- // call LiveRangeEdit::allUsesAvailableAt() and
- // LiveRangeEdit::canRematerializeAt().
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
- MachineInstr *NewMI = &*std::prev(InsertPos);
- LIS->InsertMachineInstrInMaps(*NewMI);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
- InsertedMIToOldDef[NewMI] = Def;
-
- // Update region boundaries in scheduling region we sinked from since we
- // may sink an instruction that was at the beginning or end of its region
- DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
- /*Removing =*/true);
-
- // Update region boundaries in region we sinked to.
- DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
- LaneBitmask PrevMask = NewLiveIns[I][Reg];
- // FIXME: Also update cached pressure for where the def was sinked from.
- // Update RP for all regions that has this reg as a live-in and remove
- // the reg from all regions as a live-in.
- for (auto Idx : RematDefToLiveInRegions[Def]) {
- NewLiveIns[Idx].erase(Reg);
- if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
- // Def is live-through and not used in this block.
- NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+ for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
+ // We are only collecting regions in which the register is a live-in
+ // (and may be live-through).
+ auto It = DAG.LiveIns[LIRegion].find(Reg);
+ if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
+ continue;
+ Remat.LiveInRegions.insert(LIRegion);
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LIRegion << ": ");
+
+ // Account for the reduction in RP due to the rematerialization in an
+ // optimizable region in which the defined register is a live-in. This
+ // is exact for live-through region but optimistic in the using region,
+ // where RP is actually reduced only if maximum RP is reached somewhere
+ // between the beginning of the region and the rematerializable
+ // instruction's use.
+ if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
+ RematUseful = true;
+ if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg]))
+ return true;
} else {
- // Def is used and rematerialized into this block.
- GCNDownwardRPTracker RPT(*LIS);
- auto *NonDbgMI = &*skipDebugInstructionsForward(
- NewRegions[Idx].first, NewRegions[Idx].second);
- RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
- RPT.advance(NewRegions[Idx].second);
- NewPressure[Idx] = RPT.moveMaxPressure();
+ LLVM_DEBUG(dbgs() << "unoptimizable region\n");
}
}
- SinkedDefs.push_back(Def);
- ImproveOccupancy = NewPressure[I].getOccupancy(ST);
- if (ImproveOccupancy > DAG.MinOccupancy)
- break;
+ // If the instruction is not a live-in or live-out in any optimizable
+ // region then there is no point in rematerializing it.
+ if (!RematUseful) {
+ Rematerializations.pop_back();
+ RA_DEBUG(
+ dbgs()
+ << " No impact on any optimizable region, dropping instruction\n");
+ }
}
-
- // Remove defs we just sinked from all regions' list of sinkable defs
- for (auto &Def : SinkedDefs)
- for (auto TrackedIdx : RematDefToLiveInRegions[Def])
- RematerializableInsts[TrackedIdx].erase(Def);
-
- if (ImproveOccupancy <= DAG.MinOccupancy)
- break;
-
- NewRescheduleRegions[I] = true;
- Improved = true;
}
- if (!Improved) {
- // Occupancy was not improved for all regions that were at MinOccupancy.
- // Undo sinking and remove newly rematerialized instructions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
- Register Reg = MI->getOperand(0).getReg();
- LIS->RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
- OldMI->clearRegisterDeads(Reg);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ if (FuncHasSpilling && !Rematerializations.empty()) {
+ // We won't be able to increase occupancy, but we still want to reduce
+ // spilling. Drop any instruction we collected to increase occupancy.
+ TargetOcc = DAG.MinOccupancy;
+ if (!NumRegionsSpilling) {
+ Rematerializations.truncate(RematSpillingCutoff);
+ RA_DEBUG(dbgs() << "Can only eliminate spilling\n");
+ } else {
+ RA_DEBUG(dbgs() << "Can only reduce spilling\n");
}
- return false;
- }
-
- // Occupancy was improved for all regions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
-
- // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
- DAG.BBLiveInMap.erase(OldMI);
-
- // Remove OldMI and update LIS
- Register Reg = MI->getOperand(0).getReg();
- LIS->RemoveMachineInstrFromMaps(*OldMI);
- OldMI->eraseFromParent();
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ return true;
}
+ RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
+ Rematerializations.clear();
+ return false;
+}
- // Update live-ins, register pressure, and regions caches.
- for (auto Idx : ImpactedRegions) {
- DAG.LiveIns[Idx] = NewLiveIns[Idx];
- DAG.Pressure[Idx] = NewPressure[Idx];
- DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
+void PreRARematStage::rematerialize() {
+ const auto *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ // Collect regions whose RP changes in unpredictable way; we will have to
+ // fully recompute their RP after all rematerializations.
+ DenseSet<unsigned> RecomputeRP;
+ SlotIndexes *Slots = DAG.LIS->getSlotIndexes();
+
+ // Rematerialize all instructions.
+ for (RematInstruction &Remat : Rematerializations) {
+ MachineInstr &DefMI = *Remat.DefMI;
+ MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+ Register Reg = DefMI.getOperand(0).getReg();
+ unsigned SubReg = DefMI.getOperand(0).getSubReg();
+
+ // Rematerialize DefMI to its use block. Since we are only rematerializing
+ // instructions that do not have any virtual reg uses, we do not need to
+ // call LiveRangeEdit::allUsesAvailableAt() and
+ // LiveRangeEdit::canRematerializeAt().
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg, SubReg, DefMI,
+ *DAG.TRI);
+ Remat.RematMI = &*std::prev(InsertPos);
+ Remat.RematMI->getOperand(0).setSubReg(SubReg);
+ DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
+
+ // Update region boundaries in regions we sinked from (remove defining MI)
+ // and to (insert MI rematerialized in use block). Only then we can erase
+ // the original MI.
+ DAG.updateRegionBoundaries(DAG.Regions, DefMI, nullptr);
+ DAG.updateRegionBoundaries(DAG.Regions, InsertPos, Remat.RematMI);
+ DefMI.eraseFromParent();
+ DAG.LIS->RemoveMachineInstrFromMaps(DefMI);
+
+ // Collect all regions impacted by the rematerialization and update their
+ // live-in/RP information.
+ for (unsigned I : Remat.LiveInRegions) {
+ ImpactedRegions.insert({I, DAG.Pressure[I]});
+ GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+
+ // The register is no longer a live-in in all regions but the one that
+ // contains the single use. In live-through regions, maximum register
+ // pressure decreases predictably so we can directly update it. In the
+ // using region, maximum RP may or may not decrease, so we will mark it
+ // for re-computation after all rematerializations have taken place.
+ RegionLiveIns.erase(Reg);
+ LaneBitmask PrevMask = RegionLiveIns[Reg];
----------------
jrbyrnes wrote:
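Accessing the element after erasing? `RegionLiveIns.erase(Reg)` runs immediately before `RegionLiveIns[Reg]` is read, so `PrevMask` presumably no longer sees the register's previous live-in mask; the mask should be read before the erase.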
https://github.com/llvm/llvm-project/pull/118722
More information about the llvm-commits mailing list