[llvm] [AMDGPU] Introduce Next-Use Analysis for SSA-based Register Allocation (PR #156079)
Chris Jackson via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 07:09:28 PST 2025
================
@@ -0,0 +1,456 @@
+#include "AMDGPUNextUseAnalysis.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/Timer.h"
+
+#define DEBUG_TYPE "amdgpu-next-use"
+
+using namespace llvm;
+
+// Command-line option to enable timing instrumentation
+static cl::opt<bool>
+ EnableTimers("amdgpu-next-use-analysis-timers",
+ cl::desc("Enable timing for Next Use Analysis"),
+ cl::init(false), cl::Hidden);
+
+// Static timers for performance tracking across all analysis runs.
+// NOTE(review): these are global Timer/TimerGroup objects with dynamic
+// constructors; LLVM typically prefers function-local statics (or
+// ManagedStatic) to avoid static-initialization/destruction-order issues —
+// confirm this is acceptable for this pass.
+static llvm::TimerGroup TG("amdgpu-next-use", "AMDGPU Next Use Analysis");
+static llvm::Timer AnalyzeTimer("analyze", "Time spent in analyze()", TG);
+static llvm::Timer GetDistanceTimer("getNextUseDistance",
+ "Time spent in getNextUseDistance()", TG);
+
+// Three-tier ranking system for spiller decisions: convert a stored distance
+// (materialized against SnapshotOffset) into a bounded unsigned rank.
+unsigned NextUseResult::materializeForRank(int64_t Stored,
+ unsigned SnapshotOffset) const {
+ int64_t Mat64 = materialize(Stored, SnapshotOffset);
+
+ // Tier 1: finite distances [0, LoopTag) -> returned as-is (values <= 0
+ // clamp to 0).
+ // Tier 2: loop-exit distances [LoopTag, DeadTag) -> mapped into the high
+ // range [60000, 64999].
+ // Tier 3: dead registers (>= DeadTag) -> Infinity (65535).
+ if (Mat64 >= DeadTag) {
+ return Infinity; // Tier 3: Dead registers get maximum distance
+ }
+ if (Mat64 >= LoopTag) {
+ // Tier 2: Loop-exit distances get mapped to high range [60000, 64999]
+ int64_t LoopRemainder = Mat64 - LoopTag;
+ // Clamp the remainder to fit in available range (5000 values)
+ unsigned ClampedRemainder = static_cast<unsigned>(
+ std::min(LoopRemainder, static_cast<int64_t>(4999)));
+ return 60000 + ClampedRemainder;
+ }
+ if (Mat64 <= 0) {
+ return 0; // Tier 1: Zero-distance for immediate uses
+ }
+ return static_cast<unsigned>(Mat64); // Tier 1: Finite distances as-is
+}
+
+// Record, for each loop in the function, its (exiting block -> exit block)
+// edges so that analyze() can tag distances that cross a loop boundary.
+// NOTE(review): LoopExits keeps a single exit target per exiting block; if a
+// block has multiple distinct exit edges, later entries overwrite earlier
+// ones — confirm that only one exit edge per block is expected here.
+void NextUseResult::init(const MachineFunction &MF) {
+ for (auto *L : LI->getLoopsInPreorder()) {
+ SmallVector<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Exiting;
+ L->getExitEdges(Exiting);
+ // Each pair is (exiting block inside the loop, exit block outside it).
+ for (auto P : Exiting) {
+ LoopExits[P.first->getNumber()] = P.second->getNumber();
+ }
+ }
+}
+
+// Compute next-use distances for every instruction and for each block bottom
+// in MF. Backward data-flow fixed point: blocks are visited in post order and
+// the sweep repeats until no block's upward-exposed distance set changes
+// (back edges of loops are what force re-iteration).
+void NextUseResult::analyze(const MachineFunction &MF) {
+ // Upward-exposed distances are only necessary to convey the data flow from
+ // the block to its predecessors. No need to store it beyond the analyze
+ // function as the analysis users are only interested in the use distances
+ // relatively to the given MI or the given block end.
+ DenseMap<unsigned, VRegDistances> UpwardNextUses;
+ if (EnableTimers)
+ AnalyzeTimer.startTimer();
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (const auto *MBB : post_order(&MF)) {
+ unsigned Offset = 0;
+ unsigned MBBNum = MBB->getNumber();
+ // Curr accumulates distances while walking the block bottom-up; Prev
+ // snapshots last sweep's result so convergence can be detected below.
+ VRegDistances Curr, Prev;
+ if (UpwardNextUses.contains(MBBNum)) {
+ Prev = UpwardNextUses[MBBNum];
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "\nMerging successors for "
+ << "MBB_" << MBB->getNumber() << "." << MBB->getName() << "\n";
+ });
+
+ // Seed Curr by merging the upward-exposed distances of every already
+ // analyzed successor.
+ for (auto *Succ : successors(MBB)) {
+ unsigned SuccNum = Succ->getNumber();
+
+ // Successor not analyzed yet (e.g. a back edge on the first sweep):
+ // nothing to merge from it this time around.
+ if (!UpwardNextUses.contains(SuccNum))
+ continue;
+
+ // Work on a copy: the loop-boundary transformation below must not
+ // mutate the successor's own stored distances.
+ VRegDistances SuccDist = UpwardNextUses[SuccNum];
+ LLVM_DEBUG({
+ dbgs() << "\nMerging "
+ << "MBB_" << Succ->getNumber() << "." << Succ->getName()
+ << "\n";
+ });
+
+ // Check if the edge from MBB to Succ goes out of the Loop
+ // NOTE(review): LoopExits holds one exit target per exiting block,
+ // so at most one of several exit edges gets the LoopTag weight.
+ int64_t EdgeWeight = 0;
+ if (LoopExits.contains(MBB->getNumber())) {
+ unsigned ExitTo = LoopExits[MBB->getNumber()];
+ if (SuccNum == ExitTo)
+ EdgeWeight = LoopTag;
+ }
+
+ if (LI->getLoopDepth(MBB) < LI->getLoopDepth(Succ)) {
+ // MBB->Succ is entering the Succ's loop (analysis exiting the loop)
+ // Two transformations:
+ // 1. Outside-loop uses (>= LoopTag): subtract LoopTag
+ // 2. Inside-loop uses (< LoopTag): reset to preheader position
+ // This models: if spilled before loop, reload at preheader
+ for (auto &P : SuccDist) {
+ auto &Dists = P.second;
+ VRegDistances::SortedRecords NewDists;
+ for (auto R : Dists) {
+ if (R.second >= LoopTag) {
+ // Outside-loop use: subtract LoopTag
+ R.second -= LoopTag;
+ } else {
+ // Inside-loop use: reset so distance = 0 at preheader bottom
+ R.second = -(int64_t)EntryOff[SuccNum];
+ }
+ NewDists.insert(R);
+ }
+ Dists = std::move(NewDists);
+ }
+ }
+ LLVM_DEBUG({
+ dbgs() << "\nCurr:";
+ printVregDistances(Curr /*, 0 - we're at the block bottom*/);
+ if (EdgeWeight != 0)
+ dbgs() << "\nSucc (EdgeWeight " << EdgeWeight << " applied):";
+ else
+ dbgs() << "\nSucc:";
+ printVregDistances(SuccDist, EntryOff[SuccNum], EdgeWeight);
+ });
+
+ // Filter out successor's PHI operands with SourceBlock != MBB
+ // PHI operands are only live on their specific incoming edge
+ for (auto &PHI : Succ->phis()) {
+ // Check each PHI operand pair (value, source block)
+ for (unsigned OpIdx = 1; OpIdx < PHI.getNumOperands(); OpIdx += 2) {
+ const MachineOperand &UseOp = PHI.getOperand(OpIdx);
+ const MachineOperand &BlockOp = PHI.getOperand(OpIdx + 1);
+
+ // Skip if this operand doesn't come from current MBB
+ if (BlockOp.getMBB() != MBB) {
+ VRegMaskPair PhiVMP(UseOp, TRI, MRI);
+ // Remove this PHI operand from the successor distances
+ SuccDist.clear(PhiVMP);
+ }
+ }
+ }
+
+ Curr.merge(SuccDist, EntryOff[SuccNum], EdgeWeight);
+ LLVM_DEBUG({
+ dbgs() << "\nCurr after merge:";
+ printVregDistances(Curr);
+ });
+ }
+
+ // Snapshot at the block bottom, before any of MBB's own instructions
+ // are taken into account.
+ NextUseMap[MBBNum].Bottom = Curr;
+
+ // Walk the block bottom-up: a use sets the distance at the current
+ // offset, a def kills all recorded uses of that vreg/mask pair.
+ for (auto &MI : make_range(MBB->rbegin(), MBB->rend())) {
+
+ for (auto &MO : MI.operands()) {
+
+ // Only process virtual register operands
+ // Undef operands don't represent real uses
+ if (!MO.isReg() || !MO.getReg().isVirtual() || MO.isUndef())
+ continue;
+
+ VRegMaskPair P(MO, TRI, MRI);
+ if (MO.isUse()) {
+ Curr.insert(P, -(int64_t)Offset);
+ UsedInBlock[MBB->getNumber()].insert(P);
+ } else if (MO.isDef()) {
+ Curr.clear(P);
+ UsedInBlock[MBB->getNumber()].remove(P);
+ }
+ }
+ NextUseMap[MBBNum].InstrDist[&MI] = Curr;
+ NextUseMap[MBBNum].InstrOffset[&MI] = Offset;
+ // printVregDistances(Curr, Offset);
+ // PHIs do not advance the offset: they are not real instructions at
+ // runtime.
+ if (!MI.isPHI())
+ ++Offset;
+ }
+
+ // EntryOff needs the TOTAL instruction count for correct predecessor
+ // distances while InstrOffset uses individual instruction offsets for
+ // materialization
+
+ LLVM_DEBUG({
+ dbgs() << "\nFinal distances for MBB_" << MBB->getNumber() << "."
+ << MBB->getName() << "\n";
+ printVregDistances(Curr, Offset);
+ dbgs() << "\nPrevious distances for MBB_" << MBB->getNumber() << "."
+ << MBB->getName() << "\n";
+ printVregDistances(Prev, Offset);
+ dbgs() << "\nUsed in block:\n";
+ dumpUsedInBlock();
+ });
+
+ // EntryOff -offset of the first instruction in the block top-down walk
+ EntryOff[MBBNum] = Offset;
+ UpwardNextUses[MBBNum] = std::move(Curr);
+
+ // Keep sweeping while any block's upward-exposed set still changes.
+ bool Changed4MBB = (Prev != UpwardNextUses[MBBNum]);
+
+ Changed |= Changed4MBB;
+ }
+ }
+ // Dump complete analysis results for testing
+ LLVM_DEBUG(dumpAllNextUseDistances(MF));
+ if (EnableTimers) {
+ AnalyzeTimer.stopTimer();
+ TG.print(llvm::errs());
+ }
+}
+
+// Scan the sorted records for the first entry whose lane mask overlaps Mask
+// and report its three-tier rank through D. If no record overlaps, D is left
+// untouched (callers pre-initialize it).
+void NextUseResult::getFromSortedRecords(
+    const VRegDistances::SortedRecords &Dists, LaneBitmask Mask,
+    unsigned SnapshotOffset, unsigned &D) {
+  LLVM_DEBUG(dbgs() << "Mask : [" << PrintLaneMask(Mask) << "] "
+                    << "SnapshotOffset=" << SnapshotOffset << "\n");
+
+  // Records are sorted by stored value in increasing order. Since all entries
+  // in this snapshot share the same SnapshotOffset, ordering by stored value
+  // is equivalent to ordering by materialized distance — so the first
+  // overlapping record is the nearest use for this snapshot.
+  for (const auto &Rec : Dists) {
+    const LaneBitmask RecMask = Rec.first;
+    LLVM_DEBUG(dbgs() << " UseMask : [" << PrintLaneMask(RecMask) << "]\n");
+
+    // Any overlap between the queried mask and the recorded mask counts as a
+    // use; this covers superregister, subregister, and partial-overlap cases
+    // alike.
+    if ((Mask & RecMask).none())
+      continue;
+
+    // Rank the stored distance through the three-tier mapping.
+    D = materializeForRank(static_cast<int64_t>(Rec.second), SnapshotOffset);
+    return;
+  }
+}
+
+// Collect the recorded subregister uses of VMP's register at instruction I,
+// restricted to lane masks that are wholly covered by VMP's lane mask.
+// Records are walked in reverse of their stored (increasing) order. Returns
+// an empty vector when the block/instruction/register has no recorded data.
+SmallVector<VRegMaskPair>
+NextUseResult::getSortedSubregUses(const MachineBasicBlock::iterator I,
+                                   const VRegMaskPair VMP) {
+  SmallVector<VRegMaskPair> Result;
+  const MachineBasicBlock *MBB = I->getParent();
+  // Look up the block and instruction entries once instead of re-querying
+  // the nested maps on every access.
+  auto BlockIt = NextUseMap.find(MBB->getNumber());
+  if (BlockIt == NextUseMap.end())
+    return Result;
+  auto InstrIt = BlockIt->second.InstrDist.find(&*I);
+  if (InstrIt == BlockIt->second.InstrDist.end() ||
+      !InstrIt->second.contains(VMP.getVReg()))
+    return Result;
+
+  // Bind the record set by reference; copying it per query is unnecessary.
+  const VRegDistances::SortedRecords &Dists = InstrIt->second[VMP.getVReg()];
+  LLVM_DEBUG({
+    dbgs() << "Mask : [" << PrintLaneMask(VMP.getLaneMask()) << "]\n";
+  });
+  for (const auto &P : reverse(Dists)) {
+    LaneBitmask UseMask = P.first;
+    LLVM_DEBUG(
+        { dbgs() << "Used mask : [" << PrintLaneMask(UseMask) << "]\n"; });
+    // Keep only uses fully contained in the queried lane mask.
+    if ((UseMask & VMP.getLaneMask()) == UseMask) {
+      Result.push_back({VMP.getVReg(), UseMask});
+    }
+  }
+  return Result;
+}
+
+// Block-level variant: collect the subregister uses of VMP's register that
+// are wholly covered by VMP's lane mask, based on the distance snapshot at
+// the bottom of MBB. Records are walked in reverse of their stored order.
+SmallVector<VRegMaskPair>
+NextUseResult::getSortedSubregUses(const MachineBasicBlock &MBB,
+                                   const VRegMaskPair VMP) {
+  SmallVector<VRegMaskPair> Result;
+  // Single map lookup instead of repeated operator[] queries.
+  auto BlockIt = NextUseMap.find(MBB.getNumber());
+  if (BlockIt == NextUseMap.end() ||
+      !BlockIt->second.Bottom.contains(VMP.getVReg()))
+    return Result;
+
+  // Bind the record set by reference; copying it per query is unnecessary.
+  const VRegDistances::SortedRecords &Dists =
+      BlockIt->second.Bottom[VMP.getVReg()];
+  LLVM_DEBUG(
+      { dbgs() << "Mask : [" << PrintLaneMask(VMP.getLaneMask()) << "]\n"; });
+  for (const auto &P : reverse(Dists)) {
+    LaneBitmask UseMask = P.first;
+    LLVM_DEBUG(dbgs() << "Used mask : [" << PrintLaneMask(UseMask) << "]\n");
+    // Keep only uses fully contained in the queried lane mask.
+    if ((UseMask & VMP.getLaneMask()) == UseMask) {
+      Result.push_back({VMP.getVReg(), UseMask});
+    }
+  }
+  return Result;
+}
+
+// Debug helper: print, for each basic block number, every (vreg, lane mask)
+// pair recorded as used in that block.
+void NextUseResult::dumpUsedInBlock() {
+  // Iterate by const reference; the original copied the map entry (including
+  // its whole pair set) and each pair on every iteration.
+  for (const auto &[BlockNum, Pairs] : UsedInBlock) {
+    dbgs() << "MBB_" << BlockNum << ":\n";
+    for (const auto &VMP : Pairs) {
+      dbgs() << "[ " << printReg(VMP.getVReg()) << " : <"
+             << PrintLaneMask(VMP.getLaneMask()) << "> ]\n";
+    }
+  }
+}
+
+unsigned NextUseResult::getNextUseDistance(const MachineBasicBlock::iterator I,
+ const VRegMaskPair VMP) {
+ if (EnableTimers)
+ GetDistanceTimer.startTimer();
+
+ unsigned Dist = Infinity;
+ const MachineBasicBlock *MBB = I->getParent();
+ unsigned MBBNum = MBB->getNumber();
+ if (NextUseMap.contains(MBBNum) &&
+ NextUseMap[MBBNum].InstrDist.contains(&*I)) {
+ VRegDistances Dists = NextUseMap[MBBNum].InstrDist[&*I];
+ if (NextUseMap[MBBNum].InstrDist[&*I].contains(VMP.getVReg())) {
+ // printSortedRecords(Dists[VMP.VReg], VMP.VReg);
----------------
chrisjbris wrote:
Can be removed.
https://github.com/llvm/llvm-project/pull/156079
More information about the llvm-commits
mailing list