[llvm] [AMDGPU][Scheduler] Refactor ArchVGPR rematerialization during scheduling (PR #125885)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 26 15:13:03 PDT 2025
================
@@ -1688,174 +1696,426 @@ bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
return true;
}
-void PreRARematStage::collectRematerializableInstructions() {
- const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
- for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
- Register Reg = Register::index2VirtReg(I);
- if (!DAG.LIS->hasInterval(Reg))
- continue;
+namespace {
+/// Models excess register pressure in a region and tracks our progress as we
+/// identify rematerialization opportunities.
+struct ExcessRP {
+ /// Number of excess ArchVGPRs.
+ unsigned ArchVGPRs = 0;
+ /// Number of excess AGPRs.
+ unsigned AGPRs = 0;
+ /// For unified register files, number of excess VGPRs.
+ unsigned VGPRs = 0;
+ /// For unified register files with AGPR usage, number of excess ArchVGPRs to
+ /// save before we are able to save a whole allocation granule.
+ unsigned ArchVGPRsToAlignment = 0;
+ /// Whether the region uses AGPRs.
+ bool HasAGPRs = false;
+
+ /// Constructs the excess RP model; determines the excess pressure w.r.t. a
+ /// maximum number of allowed VGPRs.
+ ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
+
+ /// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
+ /// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
+ /// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
+ /// saving these ArchVGPRs helped reduce excess pressure.
+ bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
+
+ /// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
+ /// these AGPRs helped reduce excess pressure.
+ bool saveAGPRs(unsigned NumRegs);
+
+ /// Returns whether there is any excess register pressure.
+ operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
- // TODO: Handle AGPR and SGPR rematerialization
- if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
- continue;
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
+ OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
+ << Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
+ << Excess.ArchVGPRsToAlignment << " registers)\n";
+ return OS;
+ }
+#endif
- MachineOperand *Op = DAG.MRI.getOneDef(Reg);
- MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
- continue;
+private:
+ static inline bool saveRegs(unsigned &LeftToSave, unsigned &NumRegs) {
+ unsigned NumSaved = std::min(LeftToSave, NumRegs);
+ NumRegs -= NumSaved;
+ LeftToSave -= NumSaved;
+ return NumSaved;
+ }
+};
+} // namespace
+
+ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
+ unsigned MaxVGPRs) {
+ unsigned NumArchVGPRs = RP.getArchVGPRNum();
+ unsigned NumAGPRs = RP.getAGPRNum();
+ HasAGPRs = NumAGPRs;
+
+ if (!ST.hasGFX90AInsts()) {
+ // Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
+ // independently.
+ if (NumArchVGPRs > MaxVGPRs)
+ ArchVGPRs = NumArchVGPRs - MaxVGPRs;
+ if (NumAGPRs > MaxVGPRs)
+ AGPRs = NumAGPRs - MaxVGPRs;
+ return;
+ }
- MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
- if (Def->getParent() == UseI->getParent())
- continue;
+ // Independently of whether overall VGPR pressure is under the limit, we still
+ // have to check whether ArchVGPR pressure or AGPR pressure alone exceeds the
+ // number of addressable registers in each category.
+ const unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
+ if (NumArchVGPRs > MaxArchVGPRs) {
+ ArchVGPRs = NumArchVGPRs - MaxArchVGPRs;
+ NumArchVGPRs = MaxArchVGPRs;
+ }
+ if (NumAGPRs > MaxArchVGPRs) {
+ AGPRs = NumAGPRs - MaxArchVGPRs;
+ NumAGPRs = MaxArchVGPRs;
+ }
- bool HasRematDependency = false;
- // Check if this instruction uses any registers that are planned to be
- // rematerialized
- for (auto &RematEntry : RematerializableInsts) {
- if (find_if(RematEntry.second,
- [&Def](std::pair<MachineInstr *, MachineInstr *> &Remat) {
- for (MachineOperand &MO : Def->operands()) {
- if (!MO.isReg())
- continue;
- if (MO.getReg() == Remat.first->getOperand(0).getReg())
- return true;
- }
- return false;
- }) != RematEntry.second.end()) {
- HasRematDependency = true;
- break;
- }
+ // Check overall VGPR usage against the limit; any excess above addressable
+ // register limits has already been accounted for.
+ unsigned NumVGPRs = GCNRegPressure::getUnifiedVGPRNum(NumArchVGPRs, NumAGPRs);
+ if (NumVGPRs > MaxVGPRs) {
+ VGPRs = NumVGPRs - MaxVGPRs;
+ ArchVGPRsToAlignment = NumArchVGPRs - alignDown(NumArchVGPRs, 4);
+ if (!ArchVGPRsToAlignment)
+ ArchVGPRsToAlignment = 4;
+ }
+}
+
+bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
+ bool Progress = saveRegs(ArchVGPRs, NumRegs);
+
+ if (HasAGPRs) {
+ // ArchVGPRs can only be allocated as a multiple of a granule.
+ const unsigned Granule = 4;
+ unsigned NumSavedRegs = 0;
+
+ // Count the number of whole ArchVGPR allocation granules we can save.
+ if (unsigned NumGranules = NumRegs / Granule; NumGranules) {
+ NumSavedRegs = NumGranules * Granule;
+ NumRegs -= NumSavedRegs;
}
- // Do not rematerialize an instruction if it uses an instruction that we
- // have designated for rematerialization.
- // FIXME: Allow for rematerialization chains: this requires 1. updating
- // remat points to account for uses that are rematerialized, and 2. either
- // rematerializing the candidates in careful ordering, or deferring the MBB
- // RP walk until the entire chain has been rematerialized.
- if (HasRematDependency)
- continue;
- // Similarly, check if the UseI is planned to be remat.
- for (auto &RematEntry : RematerializableInsts) {
- if (find_if(RematEntry.second,
- [&UseI](std::pair<MachineInstr *, MachineInstr *> &Remat) {
- return Remat.first == UseI;
- }) != RematEntry.second.end()) {
- HasRematDependency = true;
- break;
- }
+ // We may be able to save one more whole ArchVGPR allocation granule.
+ if (NumRegs >= ArchVGPRsToAlignment) {
+ NumSavedRegs += Granule;
+ ArchVGPRsToAlignment = Granule - (NumRegs - ArchVGPRsToAlignment);
+ } else {
+ ArchVGPRsToAlignment -= NumRegs;
}
- if (HasRematDependency)
- break;
+ // Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
+ // spilling and have some free ArchVGPR slots.
+ Progress |= saveRegs(VGPRs, NumSavedRegs);
+ if (UseArchVGPRForAGPRSpill)
+ Progress |= saveRegs(AGPRs, NumSavedRegs);
+ } else {
+ // No AGPR usage in the region i.e., no allocation granule to worry about.
+ Progress |= saveRegs(VGPRs, NumRegs);
+ }
- // We are only collecting defs that are defined in another block and are
- // live-through or used inside regions at MinOccupancy. This means that the
- // register must be in the live-in set for the region.
- bool AddedToRematList = false;
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto It = DAG.LiveIns[I].find(Reg);
- if (It != DAG.LiveIns[I].end() && !It->second.none()) {
- if (DAG.RegionsWithMinOcc[I]) {
- SlotIndex DefIdx = DAG.LIS->getInstructionIndex(*Def);
- SlotIndex UseIdx =
- DAG.LIS->getInstructionIndex(*UseI).getRegSlot(true);
- if (allUsesAvailableAt(Def, DefIdx, UseIdx)) {
- RematerializableInsts[I][Def] = UseI;
- AddedToRematList = true;
- }
- }
+ return Progress;
+}
- // Collect regions with rematerializable reg as live-in to avoid
- // searching later when updating RP.
- RematDefToLiveInRegions[Def].push_back(I);
+bool ExcessRP::saveAGPRs(unsigned NumRegs) {
+ return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
+}
+
+bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
+
+ REMAT_DEBUG(dbgs() << "Collecting rematerializable instructions in "
+ << MF.getFunction().getName() << '\n');
+
+ // Maps optimizable regions (i.e., regions at minimum and VGPR-limited
+ // occupancy, or regions with VGPR spilling) to a model of their excess RP.
+ DenseMap<unsigned, ExcessRP> OptRegions;
+ const Function &F = MF.getFunction();
+
+ // Adjust workgroup size induced occupancy bounds with the
+ // "amdgpu-waves-per-eu" attribute. This should be offloaded to a subtarget
+ // method, but at this point it is unclear how other parts of the codebase
+ // interpret this attribute and the default behavior produces unexpected
+ // bounds. Here we want to allow users to ask for target occupancies lower
+ // than the default lower bound.
+ std::pair<unsigned, unsigned> OccBounds =
+ ST.getOccupancyWithWorkGroupSizes(MF);
+ std::pair<unsigned, unsigned> WavesPerEU =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
+ if (WavesPerEU.first <= WavesPerEU.second) {
+ if (WavesPerEU.first && WavesPerEU.first <= OccBounds.second)
+ OccBounds.first = WavesPerEU.first;
+ if (WavesPerEU.second)
+ OccBounds.second = std::min(OccBounds.second, WavesPerEU.second);
+ }
+
+ // We call the "base max functions" directly because otherwise it uses the
+ // subtarget's logic for combining "amdgpu-waves-per-eu" with the function's
+ // groupsize induced occupancy bounds, producing unexpected results.
+ const unsigned MaxSGPRsNoSpill = ST.getBaseMaxNumSGPRs(
+ F, OccBounds, ST.getMaxNumPreloadedSGPRs(), ST.getReservedNumSGPRs(F));
+ const unsigned MaxVGPRsNoSpill =
+ ST.getBaseMaxNumVGPRs(F, {ST.getMinNumVGPRs(OccBounds.second),
+ ST.getMaxNumVGPRs(OccBounds.first)});
+ const unsigned MaxSGPRsIncOcc =
+ ST.getMaxNumSGPRs(DAG.MinOccupancy + 1, false);
+ const unsigned MaxVGPRsIncOcc = ST.getMaxNumVGPRs(DAG.MinOccupancy + 1);
+ IncreaseOccupancy = OccBounds.second > DAG.MinOccupancy;
+
+ auto ClearOptRegionsIf = [&](bool Cond) -> bool {
+ if (Cond) {
+ // We won't try to increase occupancy.
+ IncreaseOccupancy = false;
+ OptRegions.clear();
+ }
+ return Cond;
+ };
+
+ // Collect optimizable regions. If there is spilling in any region we will
+ // just try to reduce ArchVGPR spilling. Otherwise we will try to increase
+ // occupancy by one in the whole function.
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ GCNRegPressure &RP = DAG.Pressure[I];
+
+ // Check whether SGPR pressure prevents us from eliminating spilling.
+ unsigned NumSGPRs = RP.getSGPRNum();
+ if (NumSGPRs > MaxSGPRsNoSpill)
+ ClearOptRegionsIf(IncreaseOccupancy);
+
+ ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
+ if (Excess) {
+ ClearOptRegionsIf(IncreaseOccupancy);
+ } else if (IncreaseOccupancy) {
+ // Check whether SGPR pressure prevents us from increasing occupancy.
+ if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
+ if (DAG.MinOccupancy >= OccBounds.first)
+ return false;
+ continue;
+ }
+ if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
+ // We can only rematerialize ArchVGPRs at this point.
+ unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
----------------
lucas-rami wrote:
I think it should be the sum of the two. For non-unified RFs `Excess.VGPRs` is always 0, so it makes no difference. For unified RFs `Excess.ArchVGPRs` only accounts for ArchVGPR pressure above the addressable limit; the "regular" excess pressure is then accounted for in `Excess.VGPRs`.
For example, on gfx942 with occ lower bound of 4, no AGPR usage and ArchVGPR usage of 257, we will get `MaxVGPRsIncOcc=512/4=128`, `Excess.ArchVGPRs=1`, `Excess.VGPRs=128`, and we will need to save 129 ArchVGPRs to hope to reach the min occupancy.
https://github.com/llvm/llvm-project/pull/125885
More information about the llvm-commits
mailing list