[llvm] [AMDGPU][Scheduler] Refactor VGPR rematerialization during scheduling (PR #118722)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 4 16:27:39 PST 2024
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/118722
From e5abab8b2143a4813359097fded2e1965ebaf591 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 15:49:33 +0100
Subject: [PATCH 1/4] Working refactoring of simple VGPR remat.
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 404 +++++++++---------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 56 ++-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 13 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 6 +
.../machine-scheduler-sink-trivial-remats.mir | 145 ++++++-
7 files changed, 418 insertions(+), 215 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 57f517bfba0ebb..1b39a5a7db7192 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,8 +25,13 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
+#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
#define DEBUG_TYPE "machine-scheduler"
@@ -945,20 +950,19 @@ bool PreRARematStage::initGCNSchedStage() {
return false;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- // Check maximum occupancy
+ // Rematerialization will not help if occupancy is LDS-limited.
if (ST.computeOccupancy(MF.getFunction(), MFI.getLDSSize()) ==
DAG.MinOccupancy)
return false;
// FIXME: This pass will invalidate cached MBBLiveIns for regions
- // inbetween the defs and region we sinked the def to. Cached pressure
- // for regions where a def is sinked from will also be invalidated. Will
- // need to be fixed if there is another pass after this pass.
+  // in between the defs and the region the def is sunk to. Will need to be fixed
+ // if there is another pass after this pass.
assert(!S.hasNextStage());
- collectRematerializableInstructions();
- if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
+ if (!collectRematerializableInstructions())
return false;
+ sinkTriviallyRematInsts(ST, TII);
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1467,231 +1471,249 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
-void PreRARematStage::collectRematerializableInstructions() {
+/// Allows easy filtering of this stage's debug output.
+#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
+
+bool PreRARematStage::collectRematerializableInstructions() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
- for (unsigned I = 0, E = DAG.MRI.getNumVirtRegs(); I != E; ++I) {
- Register Reg = Register::index2VirtReg(I);
- if (!DAG.LIS->hasInterval(Reg))
- continue;
- // TODO: Handle AGPR and SGPR rematerialization
- if (!SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
- !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
- continue;
+ RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
- MachineOperand *Op = DAG.MRI.getOneDef(Reg);
- MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
+  // Maps optimizable regions (i.e., regions at minimum, VGPR-limited
+  // occupancy) to the number of VGPRs that must be deducted from their
+  // maximum VGPR pressure for their occupancy to increase by one.
+ DenseMap<unsigned, unsigned> OptRegions;
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ if (!DAG.RegionsWithMinOcc[I])
continue;
+ GCNRegPressure &RP = DAG.Pressure[I];
- MachineInstr *UseI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
- if (Def->getParent() == UseI->getParent())
+    // We do not rematerialize SGPR-defining instructions yet, so do not
+    // bother optimizing regions whose occupancy is SGPR-limited.
+ if (ST.getOccupancyWithNumSGPRs(RP.getSGPRNum()) == DAG.MinOccupancy)
continue;
- // We are only collecting defs that are defined in another block and are
- // live-through or used inside regions at MinOccupancy. This means that the
- // register must be in the live-in set for the region.
- bool AddedToRematList = false;
- for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
- auto It = DAG.LiveIns[I].find(Reg);
- if (It != DAG.LiveIns[I].end() && !It->second.none()) {
- if (DAG.RegionsWithMinOcc[I]) {
- RematerializableInsts[I][Def] = UseI;
- AddedToRematList = true;
- }
-
- // Collect regions with rematerializable reg as live-in to avoid
- // searching later when updating RP.
- RematDefToLiveInRegions[Def].push_back(I);
- }
- }
- if (!AddedToRematList)
- RematDefToLiveInRegions.erase(Def);
+ unsigned NumVGPRs = RP.getVGPRNum(ST.hasGFX90AInsts());
+ unsigned NumToIncreaseOcc = ST.getNumVGPRsToIncreaseOccupancy(NumVGPRs);
+ OptRegions.insert({I, NumToIncreaseOcc});
+ RA_DEBUG(dbgs() << "Region " << I << " has min. occupancy: decrease by "
+ << NumToIncreaseOcc << " VGPR(s) to improve occupancy\n");
}
-}
-
-bool PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
- // Temporary copies of cached variables we will be modifying and replacing if
- // sinking succeeds.
- SmallVector<
- std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>, 32>
- NewRegions;
- DenseMap<unsigned, GCNRPTracker::LiveRegSet> NewLiveIns;
- DenseMap<unsigned, GCNRegPressure> NewPressure;
- BitVector NewRescheduleRegions;
- LiveIntervals *LIS = DAG.LIS;
-
- NewRegions.resize(DAG.Regions.size());
- NewRescheduleRegions.resize(DAG.Regions.size());
-
- // Collect only regions that has a rematerializable def as a live-in.
- SmallSet<unsigned, 16> ImpactedRegions;
- for (const auto &It : RematDefToLiveInRegions)
- ImpactedRegions.insert(It.second.begin(), It.second.end());
-
- // Make copies of register pressure and live-ins cache that will be updated
- // as we rematerialize.
- for (auto Idx : ImpactedRegions) {
- NewPressure[Idx] = DAG.Pressure[Idx];
- NewLiveIns[Idx] = DAG.LiveIns[Idx];
- }
- NewRegions = DAG.Regions;
- NewRescheduleRegions.reset();
+ if (OptRegions.empty())
+ return false;
- DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
- bool Improved = false;
- for (auto I : ImpactedRegions) {
- if (!DAG.RegionsWithMinOcc[I])
- continue;
+  // Tracks the estimated rematerialization gain (i.e., reduction in RP) that
+  // the current candidate instruction provides in each optimizable region.
+ auto ReduceRPInRegion = [&](auto OptIt, unsigned I,
+ LaneBitmask Mask) -> bool {
+ auto NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
+ unsigned &RPExcess = OptIt->getSecond();
+ if (NumRegs >= RPExcess) {
+ OptRegions.erase(I);
+ LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
+ << "\n");
+ } else {
+ RPExcess -= NumRegs;
+ LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
+ << " by " << NumRegs << " (" << RPExcess << " left)\n");
+ }
+ return OptRegions.empty();
+ };
+
+  // We need up-to-date live-out info to query live-out register masks in
+ // regions containing rematerializable instructions.
+ DAG.RegionLiveOuts.buildLiveRegMap();
+
+ for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
+ auto Region = DAG.Regions[I];
+ for (auto MI = Region.first; MI != Region.second; ++MI) {
+      // We only support instructions whose first operand is a register (and
+      // not a subregister).
+ MachineInstr &DefMI = *MI;
+ if (DefMI.isBundle() || !DefMI.getNumOperands() ||
+ !DefMI.getOperand(0).isReg() || DefMI.getOperand(0).getSubReg())
+ continue;
- Improved = false;
- int VGPRUsage = NewPressure[I].getVGPRNum(ST.hasGFX90AInsts());
- int SGPRUsage = NewPressure[I].getSGPRNum();
+ // We only support rematerializing virtual VGPRs with one definition and
+ // one use.
+ Register Reg = DefMI.getOperand(0).getReg();
+ if (!Reg.isVirtual() || !DAG.LIS->hasInterval(Reg) ||
+ !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
+ !DAG.MRI.hasOneDef(Reg) || !DAG.MRI.hasOneNonDBGUse(Reg))
+ continue;
- // TODO: Handle occupancy drop due to AGPR and SGPR.
- // Check if cause of occupancy drop is due to VGPR usage and not SGPR.
- if (ST.getOccupancyWithNumSGPRs(SGPRUsage) == DAG.MinOccupancy)
- break;
+ // The instruction must be trivially rematerializable and have no virtual
+ // register use.
+ if (!isTriviallyReMaterializable(DefMI))
+ continue;
- // The occupancy of this region could have been improved by a previous
- // iteration's sinking of defs.
- if (NewPressure[I].getOccupancy(ST) > DAG.MinOccupancy) {
- NewRescheduleRegions[I] = true;
- Improved = true;
- continue;
- }
+ // We only care to rematerialize the instruction if its single use is in a
+ // different block.
+ MachineInstr *UseMI = &*DAG.MRI.use_instr_nodbg_begin(Reg);
+ if (DefMI.getParent() == UseMI->getParent())
+ continue;
- // First check if we have enough trivially rematerializable instructions to
- // improve occupancy. Optimistically assume all instructions we are able to
- // sink decreased RP.
- int TotalSinkableRegs = 0;
- for (const auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- Register DefReg = Def->getOperand(0).getReg();
- TotalSinkableRegs +=
- SIRegisterInfo::getNumCoveredRegs(NewLiveIns[I][DefReg]);
- }
- int VGPRsAfterSink = VGPRUsage - TotalSinkableRegs;
- unsigned OptimisticOccupancy = ST.getOccupancyWithNumVGPRs(VGPRsAfterSink);
- // If in the most optimistic scenario, we cannot improve occupancy, then do
- // not attempt to sink any instructions.
- if (OptimisticOccupancy <= DAG.MinOccupancy)
- break;
+ RA_DEBUG(dbgs() << "In region " << I << ", instruction " << DefMI
+ << " is rematerializable with single use " << *UseMI);
+ auto &Remat = RematInstructions.emplace_back(&DefMI, I, UseMI);
+
+ bool RematUseful = false;
+ if (auto It = OptRegions.find(I); It != OptRegions.end()) {
+ // Optimistically consider that moving the instruction out of its
+ // defining region will reduce RP in the latter; this assumes that
+ // maximum RP in the region is reached somewhere between the defining
+ // instruction and the end of the region.
+ RA_DEBUG(dbgs() << " Instruction's defining region is optimizable: ");
+ RematUseful = true;
+ auto RegMask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
+ if (ReduceRPInRegion(It, I, RegMask))
+ return true;
+ }
- unsigned ImproveOccupancy = 0;
- SmallVector<MachineInstr *, 4> SinkedDefs;
- for (auto &It : RematerializableInsts[I]) {
- MachineInstr *Def = It.first;
- MachineBasicBlock::iterator InsertPos =
- MachineBasicBlock::iterator(It.second);
- Register Reg = Def->getOperand(0).getReg();
- // Rematerialize MI to its use block. Since we are only rematerializing
- // instructions that do not have any virtual reg uses, we do not need to
- // call LiveRangeEdit::allUsesAvailableAt() and
- // LiveRangeEdit::canRematerializeAt().
- TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
- Def->getOperand(0).getSubReg(), *Def, *DAG.TRI);
- MachineInstr *NewMI = &*std::prev(InsertPos);
- LIS->InsertMachineInstrInMaps(*NewMI);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
- InsertedMIToOldDef[NewMI] = Def;
-
- // Update region boundaries in scheduling region we sinked from since we
- // may sink an instruction that was at the beginning or end of its region
- DAG.updateRegionBoundaries(NewRegions, Def, /*NewMI =*/nullptr,
- /*Removing =*/true);
-
- // Update region boundaries in region we sinked to.
- DAG.updateRegionBoundaries(NewRegions, InsertPos, NewMI);
-
- LaneBitmask PrevMask = NewLiveIns[I][Reg];
- // FIXME: Also update cached pressure for where the def was sinked from.
- // Update RP for all regions that has this reg as a live-in and remove
- // the reg from all regions as a live-in.
- for (auto Idx : RematDefToLiveInRegions[Def]) {
- NewLiveIns[Idx].erase(Reg);
- if (InsertPos->getParent() != DAG.Regions[Idx].first->getParent()) {
- // Def is live-through and not used in this block.
- NewPressure[Idx].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+ for (unsigned LVRegion = 0; LVRegion != E; ++LVRegion) {
+ // We are only collecting regions in which the register is a live-in
+ // (and may be live-through).
+ auto It = DAG.LiveIns[LVRegion].find(Reg);
+ if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
+ continue;
+ Remat.LiveInRegions.insert(LVRegion);
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion
+ << ": ");
+
+ // Account for the reduction in RP due to the rematerialization in an
+ // optimizable region in which the defined register is a live-in. This
+        // is exact for live-through regions but optimistic in the using region,
+ // where RP is actually reduced only if maximum RP is reached somewhere
+ // between the beginning of the region and the rematerializable
+ // instruction's use.
+ if (auto It = OptRegions.find(LVRegion); It != OptRegions.end()) {
+ RematUseful = true;
+ if (ReduceRPInRegion(It, LVRegion, DAG.LiveIns[LVRegion][Reg]))
+ return true;
} else {
- // Def is used and rematerialized into this block.
- GCNDownwardRPTracker RPT(*LIS);
- auto *NonDbgMI = &*skipDebugInstructionsForward(
- NewRegions[Idx].first, NewRegions[Idx].second);
- RPT.reset(*NonDbgMI, &NewLiveIns[Idx]);
- RPT.advance(NewRegions[Idx].second);
- NewPressure[Idx] = RPT.moveMaxPressure();
+ LLVM_DEBUG(dbgs() << "unoptimizable region\n");
}
}
- SinkedDefs.push_back(Def);
- ImproveOccupancy = NewPressure[I].getOccupancy(ST);
- if (ImproveOccupancy > DAG.MinOccupancy)
- break;
+      // If the defined register is not a live-in or live-out in any
+      // optimizable region then there is no point in rematerializing the
+      // instruction.
+ if (!RematUseful) {
+ RematInstructions.pop_back();
+ RA_DEBUG(
+ dbgs()
+ << " No impact on any optimizable region, dropping instruction\n");
+ }
}
+ }
- // Remove defs we just sinked from all regions' list of sinkable defs
- for (auto &Def : SinkedDefs)
- for (auto TrackedIdx : RematDefToLiveInRegions[Def])
- RematerializableInsts[TrackedIdx].erase(Def);
+ RA_DEBUG(dbgs() << "Cannot increase occupancy through rematerialization\n");
+ return false;
+}
- if (ImproveOccupancy <= DAG.MinOccupancy)
- break;
+void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
+  // Collect regions whose live-ins or register pressure will change due to
+  // rematerialization; map a region to true when its maximum RP must be
+  // fully recomputed at the end.
+ SmallDenseMap<unsigned, bool> ImpactedRegions;
+  // Maps rematerialized instructions to the ones they were rematerialized from.
+ DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
+ LiveIntervals *LIS = DAG.LIS;
- NewRescheduleRegions[I] = true;
- Improved = true;
+ // TODO: In the spirit of rematerializing the minimum number of instructions
+ // to increase occupancy, here we could sort the list of rematerializable
+ // instructions in decreasing order of "expected profitability" so that we end
+ // up moving as few instructions as possible in the loop below.
+ for (RematInstruction &Remat : RematInstructions) {
+ MachineInstr *DefMI = Remat.RematMI;
+ MachineBasicBlock::iterator InsertPos(Remat.UseMI);
+ Register Reg = DefMI->getOperand(0).getReg();
+
+ // Rematerialize MI to its use block. Since we are only rematerializing
+ // instructions that do not have any virtual reg uses, we do not need to
+ // call LiveRangeEdit::allUsesAvailableAt() and
+ // LiveRangeEdit::canRematerializeAt().
+ TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
+ DefMI->getOperand(0).getSubReg(), *DefMI, *DAG.TRI);
+ MachineInstr *NewMI = &*std::prev(InsertPos);
+ LIS->InsertMachineInstrInMaps(*NewMI);
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ InsertedMIToOldDef[NewMI] = DefMI;
+
+    // Update region boundaries in the scheduling region we sink from since we
+    // may sink an instruction that was at the beginning or end of its region.
+ DAG.updateRegionBoundaries(DAG.Regions, DefMI, /*NewMI =*/nullptr,
+ /*Removing =*/true);
+
+ // Update region boundaries in region we sinked to.
+ DAG.updateRegionBoundaries(DAG.Regions, InsertPos, NewMI);
+
+ // Collect all regions impacted by the rematerialization and update their
+ // live-in/RP information.
+ for (unsigned I : Remat.LiveInRegions) {
+ ImpactedRegions.insert({I, false});
+ GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
+
+      // The register is no longer a live-in in any of these regions. In
+      // live-through regions, maximum register pressure decreases predictably
+      // so we can directly update it. In the using region, maximum register
+      // pressure may or may not decrease, so we will mark it for
+      // re-computation after all rematerializations.
+      LaneBitmask PrevMask = RegionLiveIns[Reg];
+      RegionLiveIns.erase(Reg);
+      if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent()) {
+        // Register is live-through and not used in this block.
+        DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
+ } else {
+ // Register is used in this block.
+ ImpactedRegions[I] = true;
+ }
+ }
+
+    // RP in the region from which the instruction was rematerialized may or may
+    // not change, so recompute it fully later.
+ ImpactedRegions[Remat.DefRegion] = true;
}
- if (!Improved) {
- // Occupancy was not improved for all regions that were at MinOccupancy.
- // Undo sinking and remove newly rematerialized instructions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
- Register Reg = MI->getOperand(0).getReg();
- LIS->RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
- OldMI->clearRegisterDeads(Reg);
- LIS->removeInterval(Reg);
- LIS->createAndComputeVirtRegInterval(Reg);
+ // All regions impacted by at least one rematerialization must be rescheduled.
+ BitVector NewRescheduleRegions(DAG.Regions.size());
+ for (auto &[I, RecomputeRP] : ImpactedRegions) {
+ NewRescheduleRegions[I] = true;
+ DAG.MBBLiveIns.erase(DAG.Regions[I].first->getParent());
+
+    // Recompute maximum RP in regions from or to which at least one
+    // instruction was rematerialized.
+ if (RecomputeRP) {
+ GCNDownwardRPTracker RPT(*LIS);
+ auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
+ DAG.Regions[I].second);
+ RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
+ RPT.advance(DAG.Regions[I].second);
+ DAG.Pressure[I] = RPT.moveMaxPressure();
}
- return false;
}
+ DAG.RescheduleRegions = NewRescheduleRegions;
- // Occupancy was improved for all regions.
- for (auto &Entry : InsertedMIToOldDef) {
- MachineInstr *MI = Entry.first;
- MachineInstr *OldMI = Entry.second;
-
- // Remove OldMI from BBLiveInMap since we are sinking it from its MBB.
+  // Clean up the IR; remove the original instructions and update state.
+ for (auto &[NewMI, OldMI] : InsertedMIToOldDef) {
+ // Remove rematerialized instruction from BBLiveInMap since we are sinking
+ // it from its MBB.
DAG.BBLiveInMap.erase(OldMI);
- // Remove OldMI and update LIS
- Register Reg = MI->getOperand(0).getReg();
+ // Remove the rematerialized instruction and update LIS.
+ Register Reg = NewMI->getOperand(0).getReg();
LIS->RemoveMachineInstrFromMaps(*OldMI);
OldMI->eraseFromParent();
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
}
- // Update live-ins, register pressure, and regions caches.
- for (auto Idx : ImpactedRegions) {
- DAG.LiveIns[Idx] = NewLiveIns[Idx];
- DAG.Pressure[Idx] = NewPressure[Idx];
- DAG.MBBLiveIns.erase(DAG.Regions[Idx].first->getParent());
- }
- DAG.Regions = NewRegions;
- DAG.RescheduleRegions = NewRescheduleRegions;
-
if (GCNTrackers)
DAG.RegionLiveOuts.buildLiveRegMap();
SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
-
- return true;
}
// Copied from MachineLICM
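As an aside for reviewers: the bookkeeping performed by the ReduceRPInRegion lambda in canIncreaseOccupancy reduces to decrementing a per-region VGPR excess until every optimizable region reaches zero. A minimal standalone model of that accounting (hypothetical names, not the patch's actual API, which passes an iterator and a lane mask instead):

#include <map>

// Each optimizable region starts out mapped to the number of VGPRs that must
// be freed for its occupancy to increase by one (its "excess").
using OptRegionMap = std::map<unsigned, unsigned>;

// Credit NumRegs freed VGPRs to one region, erasing it once its excess is
// covered. Returns true when no optimizable region remains, i.e., when the
// candidates collected so far should suffice to increase occupancy.
static bool reduceRPInRegion(OptRegionMap &OptRegions, unsigned Region,
                             unsigned NumRegs) {
  auto It = OptRegions.find(Region);
  if (It != OptRegions.end()) {
    if (NumRegs >= It->second)
      OptRegions.erase(It);
    else
      It->second -= NumRegs;
  }
  return OptRegions.empty();
}

int main() {
  OptRegionMap OptRegions{{0, 44}, {2, 4}};
  reduceRPInRegion(OptRegions, 2, 4);                 // region 2 fully covered
  return reduceRPInRegion(OptRegions, 0, 44) ? 0 : 1; // all regions covered
}

This mirrors why collection can stop early: as soon as the map empties, the stage knows rematerializing the candidates gathered so far is expected to raise occupancy everywhere it is currently VGPR-bound.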
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 64d517038f90e0..c378564d5b2e97 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,7 +14,8 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
#include "GCNRegPressure.h"
-#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
namespace llvm {
@@ -421,27 +422,42 @@ class ClusteredLowOccStage : public GCNSchedStage {
class PreRARematStage : public GCNSchedStage {
private:
- // Each region at MinOccupancy will have their own list of trivially
- // rematerializable instructions we can remat to reduce RP. The list maps an
- // instruction to the position we should remat before, usually the MI using
- // the rematerializable instruction.
- MapVector<unsigned, MapVector<MachineInstr *, MachineInstr *>>
- RematerializableInsts;
-
- // Map a trivially rematerializable def to a list of regions at MinOccupancy
- // that has the defined reg as a live-in.
- DenseMap<MachineInstr *, SmallVector<unsigned, 4>> RematDefToLiveInRegions;
-
- // Collect all trivially rematerializable VGPR instructions with a single def
- // and single use outside the defining block into RematerializableInsts.
- void collectRematerializableInstructions();
-
+ /// A trivially rematerializable VGPR-defining instruction along with
+ /// pre-computed information to help update the scheduler's status when we
+ /// rematerialize it.
+ struct RematInstruction {
+ /// Trivially rematerializable instruction.
+ MachineInstr *RematMI;
+ /// Region containing the rematerializable instruction.
+ unsigned DefRegion;
+ /// Single use of the rematerializable instruction's defined register,
+ /// located in a different block.
+ MachineInstr *UseMI;
+ /// Set of regions in which the rematerializable instruction's defined
+ /// register is a live-in.
+ SmallDenseSet<unsigned, 4> LiveInRegions;
+
+ RematInstruction(MachineInstr *RematMI, unsigned DefRegion,
+ MachineInstr *UseMI)
+ : RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
+ };
+
+ /// List of eligible rematerializable instructions to sink to increase
+ /// occupancy, in function instruction order.
+ std::vector<RematInstruction> RematInstructions;
+
+ /// Collect all trivially rematerializable VGPR instructions with a single def
+ /// and single use outside the defining block into RematerializableInsts.
+ bool collectRematerializableInstructions();
+
+ /// Whether the MI is trivially rematerializable and does not have eany
+ /// virtual register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
- // TODO: Should also attempt to reduce RP of SGPRs and AGPRs
- // Attempt to reduce RP of VGPR by sinking trivially rematerializable
- // instructions. Returns true if we were able to sink instruction(s).
- bool sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
+ /// Sinks all instructions in RematInstructions to increase function
+ /// occupancy. Modified regions are tagged for rescheduling.
+ void sinkTriviallyRematInsts(const GCNSubtarget &ST,
const TargetInstrInfo *TII);
public:
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 51361b75940560..78f15ed8be8cae 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -367,6 +367,10 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}
+unsigned GCNSubtarget::getNumVGPRsToIncreaseOccupancy(unsigned NumVGPRs) const {
+ return AMDGPU::IsaInfo::getNumVGPRsToIncreaseWavesPerEU(this, NumVGPRs);
+}
+
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ea5e159fdd8363..20e1ba350ed8e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1368,6 +1368,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ /// Returns the necessary reduction in number of VGPRs from using \p VGPRs
+ /// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
+ /// already results in maximum occupancy.
+ unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
+
/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index ab5f0694c07f95..a996cb21848643 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1185,6 +1185,19 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
return std::min(std::max(TotalNumVGPRs / RoundedRegs, 1u), MaxWaves);
}
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs) {
+ unsigned Granule = getVGPRAllocGranule(STI);
+ unsigned MaxWaves = getMaxWavesPerEU(STI);
+ unsigned TotalNumVGPRs = getTotalNumVGPRs(STI);
+
+ unsigned NumWaves =
+ getNumWavesPerEUWithNumVGPRs(NumVGPRs, Granule, MaxWaves, TotalNumVGPRs);
+ if (NumWaves == MaxWaves)
+ return 0;
+ return NumVGPRs - alignDown(TotalNumVGPRs / (NumWaves + 1), Granule);
+}
+
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
AMDGPUSubtarget::Generation Gen) {
if (Gen >= AMDGPUSubtarget::GFX10)
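To illustrate the arithmetic in the new helper, here is a quick sanity check with assumed gfx908-like parameters (256 addressable VGPRs, allocation granule 4, at most 10 waves per EU); the numbers are illustrative only and not asserted by the patch:

#include <algorithm>
#include <cassert>

static unsigned alignTo(unsigned V, unsigned G) { return (V + G - 1) / G * G; }
static unsigned alignDown(unsigned V, unsigned G) { return V - V % G; }

// Rough model of getNumWavesPerEUWithNumVGPRs under the assumed limits.
static unsigned wavesWithVGPRs(unsigned NumVGPRs) {
  const unsigned Granule = 4, MaxWaves = 10, TotalNumVGPRs = 256;
  unsigned Rounded = std::max(alignTo(NumVGPRs, Granule), Granule);
  return std::min(std::max(TotalNumVGPRs / Rounded, 1u), MaxWaves);
}

int main() {
  // A region peaking at 128 VGPRs runs at 256/128 = 2 waves per EU. To reach
  // 3 waves, usage must drop to alignDown(256/3, 4) = 84, so the new helper
  // should report a required reduction of 128 - 84 = 44 VGPRs.
  assert(wavesWithVGPRs(128) == 2);
  assert(128 - alignDown(256 / (wavesWithVGPRs(128) + 1), 4) == 44);
  return 0;
}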
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f7fbec6a542f7..8b4136da9503cc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -324,6 +324,12 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
+/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
+/// to increase the achievable number of waves per EU for this subtarget by 1.
+/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of waves per EU.
+unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
+ unsigned NumVGPRs);
+
/// \returns Number of waves reachable for a given \p NumVGPRs usage, \p Granule
/// size, \p MaxWaves possible, and \p TotalNumVGPRs available.
unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
diff --git a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
index 018da7f81e3d4b..abed0e080b7eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats.mir
@@ -3163,11 +3163,11 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -3353,11 +3353,11 @@ body: |
; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
+ ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
; GFX908-NEXT: [[V_CVT_I32_F64_e32_34:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 34, implicit $exec, implicit $mode, implicit-def $m0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sreg_64 = S_MOV_B32 0
- ; GFX908-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX908-NEXT: {{ $}}
; GFX908-NEXT: bb.1:
; GFX908-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
@@ -5913,4 +5913,141 @@ body: |
S_NOP 0, implicit %22
S_ENDPGM 0
...
+---
+name: test_live_through_occ_7_sink_for_8
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GFX908-LABEL: name: test_live_through_occ_7_sink_for_8
+ ; GFX908: bb.0:
+ ; GFX908-NEXT: successors: %bb.1(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.1:
+ ; GFX908-NEXT: successors: %bb.2(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_2:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_3:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_4:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_5:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_6:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_7:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_8:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_9:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_10:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_11:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_12:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_13:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_14:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_15:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: dead [[V_CVT_I32_F64_e32_16:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_17:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_18:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_19:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_20:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_21:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_22:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_23:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_24:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_25:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_26:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_27:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_28:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_29:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_30:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_31:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_32:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.2:
+ ; GFX908-NEXT: successors: %bb.3(0x80000000)
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_1]], implicit [[V_CVT_I32_F64_e32_17]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_2]], implicit [[V_CVT_I32_F64_e32_18]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_3]], implicit [[V_CVT_I32_F64_e32_19]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_4]], implicit [[V_CVT_I32_F64_e32_20]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_5]], implicit [[V_CVT_I32_F64_e32_21]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_6]], implicit [[V_CVT_I32_F64_e32_22]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_7]], implicit [[V_CVT_I32_F64_e32_23]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_8]], implicit [[V_CVT_I32_F64_e32_24]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_9]], implicit [[V_CVT_I32_F64_e32_25]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_10]], implicit [[V_CVT_I32_F64_e32_26]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_11]], implicit [[V_CVT_I32_F64_e32_27]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_12]], implicit [[V_CVT_I32_F64_e32_28]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_13]], implicit [[V_CVT_I32_F64_e32_29]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_14]], implicit [[V_CVT_I32_F64_e32_30]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_15]], implicit [[V_CVT_I32_F64_e32_31]]
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_32]]
+ ; GFX908-NEXT: {{ $}}
+ ; GFX908-NEXT: bb.3:
+ ; GFX908-NEXT: [[V_CVT_I32_F64_e32_33:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+ ; GFX908-NEXT: S_NOP 0, implicit [[V_CVT_I32_F64_e32_33]], implicit [[V_CVT_I32_F64_e32_]]
+ ; GFX908-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1
+
+ %0:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 0, implicit $exec, implicit $mode
+ %1:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 1, implicit $exec, implicit $mode
+
+ bb.1:
+ successors: %bb.2
+ %2:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 2, implicit $exec, implicit $mode, implicit-def $m0
+ %3:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 3, implicit $exec, implicit $mode, implicit-def $m0
+ %4:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 4, implicit $exec, implicit $mode, implicit-def $m0
+ %5:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 5, implicit $exec, implicit $mode, implicit-def $m0
+ %6:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 6, implicit $exec, implicit $mode, implicit-def $m0
+ %7:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 7, implicit $exec, implicit $mode, implicit-def $m0
+ %8:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 8, implicit $exec, implicit $mode, implicit-def $m0
+ %9:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 9, implicit $exec, implicit $mode, implicit-def $m0
+ %10:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 10, implicit $exec, implicit $mode, implicit-def $m0
+ %11:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 11, implicit $exec, implicit $mode, implicit-def $m0
+ %12:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 12, implicit $exec, implicit $mode, implicit-def $m0
+ %13:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 13, implicit $exec, implicit $mode, implicit-def $m0
+ %14:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 14, implicit $exec, implicit $mode, implicit-def $m0
+ %15:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 15, implicit $exec, implicit $mode, implicit-def $m0
+ %16:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 16, implicit $exec, implicit $mode, implicit-def $m0
+ %17:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 17, implicit $exec, implicit $mode, implicit-def $m0
+ %18:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 18, implicit $exec, implicit $mode, implicit-def $m0
+ %19:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 19, implicit $exec, implicit $mode, implicit-def $m0
+ %20:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 20, implicit $exec, implicit $mode, implicit-def $m0
+ %21:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 21, implicit $exec, implicit $mode, implicit-def $m0
+ %22:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 22, implicit $exec, implicit $mode, implicit-def $m0
+ %23:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 23, implicit $exec, implicit $mode, implicit-def $m0
+ %24:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 24, implicit $exec, implicit $mode, implicit-def $m0
+ %25:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 25, implicit $exec, implicit $mode, implicit-def $m0
+ %26:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 26, implicit $exec, implicit $mode, implicit-def $m0
+ %27:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 27, implicit $exec, implicit $mode, implicit-def $m0
+ %28:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 28, implicit $exec, implicit $mode, implicit-def $m0
+ %29:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 29, implicit $exec, implicit $mode, implicit-def $m0
+ %30:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 30, implicit $exec, implicit $mode, implicit-def $m0
+ %31:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 31, implicit $exec, implicit $mode, implicit-def $m0
+ %32:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 32, implicit $exec, implicit $mode, implicit-def $m0
+ %33:vgpr_32 = nofpexcept V_CVT_I32_F64_e32 33, implicit $exec, implicit $mode, implicit-def $m0
+
+ bb.2:
+ successors: %bb.3
+
+ S_NOP 0, implicit %2, implicit %18
+ S_NOP 0, implicit %3, implicit %19
+ S_NOP 0, implicit %4, implicit %20
+ S_NOP 0, implicit %5, implicit %21
+ S_NOP 0, implicit %6, implicit %22
+ S_NOP 0, implicit %7, implicit %23
+ S_NOP 0, implicit %8, implicit %24
+ S_NOP 0, implicit %9, implicit %25
+ S_NOP 0, implicit %10, implicit %26
+ S_NOP 0, implicit %11, implicit %27
+ S_NOP 0, implicit %12, implicit %28
+ S_NOP 0, implicit %13, implicit %29
+ S_NOP 0, implicit %14, implicit %30
+ S_NOP 0, implicit %15, implicit %31
+ S_NOP 0, implicit %16, implicit %32
+ S_NOP 0, implicit %33
+
+ bb.3:
+ S_NOP 0, implicit %0, implicit %1
+ S_ENDPGM 0
+...
From a2e20f4db4d88db6a95198c628f8dcf69551c29d Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Wed, 4 Dec 2024 16:45:16 +0100
Subject: [PATCH 2/4] Edit stale comments and slightly rearch stage
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++------------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 26 ++++++++-------
2 files changed, 30 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 1b39a5a7db7192..e43014efa08c5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -960,9 +960,10 @@ bool PreRARematStage::initGCNSchedStage() {
// if there is another pass after this pass.
assert(!S.hasNextStage());
- if (!collectRematerializableInstructions())
+ std::vector<RematInstruction> RematInstructions;
+ if (!canIncreaseOccupancy(RematInstructions))
return false;
- sinkTriviallyRematInsts(ST, TII);
+ sinkTriviallyRematInsts(RematInstructions, ST, TII);
LLVM_DEBUG(
dbgs() << "Retrying function scheduling with improved occupancy of "
@@ -1258,8 +1259,7 @@ GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
- dbgs() << "\n\t"
- << "Metric: "
+ dbgs() << "\n\t" << "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
@@ -1294,8 +1294,7 @@ GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
#ifndef NDEBUG
LLVM_DEBUG(
printScheduleModel(ReadyCyclesSorted);
- dbgs() << "\n\t"
- << "Metric: "
+ dbgs() << "\n\t" << "Metric: "
<< (SumBubbles
? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
: 1)
@@ -1343,8 +1342,7 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
<< " *********** BEFORE UnclusteredHighRPStage ***********\n");
- ScheduleMetrics MBefore =
- getScheduleMetrics(DAG.SUnits);
+ ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
LLVM_DEBUG(
dbgs()
<< "\n *********** AFTER UnclusteredHighRPStage ***********\n");
@@ -1474,7 +1472,8 @@ void GCNSchedStage::revertScheduling() {
/// Allows easy filtering of this stage's debug output.
#define RA_DEBUG(X) LLVM_DEBUG(dbgs() << "[PreRARemat] "; X;)
-bool PreRARematStage::collectRematerializableInstructions() {
+bool PreRARematStage::canIncreaseOccupancy(
+ std::vector<RematInstruction> &RematInstructions) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(DAG.TRI);
RA_DEBUG(dbgs() << "Collecting rematerializable instructions\n");
@@ -1511,11 +1510,11 @@ bool PreRARematStage::collectRematerializableInstructions() {
if (NumRegs >= RPExcess) {
OptRegions.erase(I);
LLVM_DEBUG(dbgs() << "sinking increases occupancy in region " << I
- << "\n");
+ << "\n");
} else {
RPExcess -= NumRegs;
LLVM_DEBUG(dbgs() << "sinking reduces excess pressure in region " << I
- << " by " << NumRegs << " (" << RPExcess << " left)\n");
+ << " by " << NumRegs << " (" << RPExcess << " left)\n");
}
return OptRegions.empty();
};
@@ -1577,8 +1576,7 @@ bool PreRARematStage::collectRematerializableInstructions() {
if (It == DAG.LiveIns[LVRegion].end() || It->second.none())
continue;
Remat.LiveInRegions.insert(LVRegion);
- RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion
- << ": ");
+ RA_DEBUG(dbgs() << " Def is live-in in region " << LVRegion << ": ");
// Account for the reduction in RP due to the rematerialization in an
// optimizable region in which the defined register is a live-in. This
@@ -1610,8 +1608,9 @@ bool PreRARematStage::collectRematerializableInstructions() {
return false;
}
-void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
- const TargetInstrInfo *TII) {
+void PreRARematStage::sinkTriviallyRematInsts(
+ ArrayRef<RematInstruction> RematInstructions, const GCNSubtarget &ST,
+ const TargetInstrInfo *TII) {
// Collect regions whose live-ins or register pressure will change due to
// rematerialization; map a region to true when its maximum RP must be
// fully recomputed at the end.
@@ -1620,11 +1619,7 @@ void PreRARematStage::sinkTriviallyRematInsts(const GCNSubtarget &ST,
DenseMap<MachineInstr *, MachineInstr *> InsertedMIToOldDef;
LiveIntervals *LIS = DAG.LIS;
- // TODO: In the spirit of rematerializing the minimum number of instructions
- // to increase occupancy, here we could sort the list of rematerializable
- // instructions in decreasing order of "expected profitability" so that we end
- // up moving as few instructions as possible in the loop below.
- for (RematInstruction &Remat : RematInstructions) {
+ for (const RematInstruction &Remat : RematInstructions) {
MachineInstr *DefMI = Remat.RematMI;
MachineBasicBlock::iterator InsertPos(Remat.UseMI);
Register Reg = DefMI->getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index c378564d5b2e97..e9ffea09955daf 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -420,6 +420,12 @@ class ClusteredLowOccStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};
+/// Attempts to increase function occupancy by one with respect to VGPR usage
+/// by sinking trivially rematerializable instructions to their use. When the
+/// stage estimates that increasing occupancy is possible, as few instructions
+/// as possible are rematerialized to reduce potential negative effects on
+/// function latency.
+///
+/// TODO: We should extend this to work on SGPRs and AGPRs as well.
class PreRARematStage : public GCNSchedStage {
private:
/// A trivially rematerializable VGPR-defining instruction along with
@@ -442,22 +448,20 @@ class PreRARematStage : public GCNSchedStage {
: RematMI(RematMI), DefRegion(DefRegion), UseMI(UseMI) {}
};
- /// List of eligible rematerializable instructions to sink to increase
- /// occupancy, in function instruction order.
- std::vector<RematInstruction> RematInstructions;
+ /// Determines whether we can increase function occupancy by 1 through
+  /// rematerialization. If we can, returns true and fills \p RematInstructions
+ /// with a list of rematerializable instructions whose sinking would result in
+ /// increased occupancy; returns false otherwise.
+ bool canIncreaseOccupancy(std::vector<RematInstruction> &RematInstructions);
- /// Collect all trivially rematerializable VGPR instructions with a single def
- /// and single use outside the defining block into RematerializableInsts.
- bool collectRematerializableInstructions();
-
- /// Whether the MI is trivially rematerializable and does not have eany
+ /// Whether the MI is trivially rematerializable and does not have any
/// virtual register use.
bool isTriviallyReMaterializable(const MachineInstr &MI);
- /// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
- /// Sinks all instructions in RematInstructions to increase function
+ /// Sinks all instructions in \p RematInstructions to increase function
/// occupancy. Modified regions are tagged for rescheduling.
- void sinkTriviallyRematInsts(const GCNSubtarget &ST,
+ void sinkTriviallyRematInsts(ArrayRef<RematInstruction> RematInstructions,
+ const GCNSubtarget &ST,
const TargetInstrInfo *TII);
public:
From 0d9103463ae71faf19645dc0efb0337c0e972426 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:15:48 +0100
Subject: [PATCH 3/4] Wrap comment correctly
---
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 8b4136da9503cc..783827f994c0e0 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -326,7 +326,8 @@ unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo *STI,
/// Returns the necessary reduction in number of VGPRs from using \p VGPRs VGPRs
/// to increase the achievable number of waves per EU for this subtarget by 1.
-/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of waves per EU.
+/// Returns 0 when using \p VGPRs VGPRs already results in maximum number of
+/// waves per EU.
unsigned getNumVGPRsToIncreaseWavesPerEU(const MCSubtargetInfo *STI,
unsigned NumVGPRs);
From c095a15b92aa9318489b78ab7fa0cffaa544aaea Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Thu, 5 Dec 2024 01:27:12 +0100
Subject: [PATCH 4/4] Fix format
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 20e1ba350ed8e7..99e163a5661fe1 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1372,7 +1372,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// VGPRs to increase occupancy by 1. Returns 0 when using \p VGPRs VGPRs
/// already results in maximum occupancy.
unsigned getNumVGPRsToIncreaseOccupancy(unsigned VGPRs) const;
-
+
/// Return occupancy for the given function. Used LDS and a number of
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but