[llvm] [AMDGPU] Introduce Next-Use Analysis for SSA-based Register Allocation (PR #156079)

Mon Dec 8 23:47:04 PST 2025

================
@@ -0,0 +1,482 @@
+//===-- AMDGPUNextUseAnalysis.cpp - Next Use Analysis ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the Next Use Analysis for AMDGPU targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUNextUseAnalysis.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/Timer.h"
+
+#define DEBUG_TYPE "amdgpu-next-use"
+
+using namespace llvm;
+
+// Hidden flag, off by default: turns on timing of the Next Use Analysis.
+static cl::opt<bool>
+    EnableTimers("amdgpu-next-use-analysis-timers",
+                 cl::desc("Enable timing for Next Use Analysis"),
+                 cl::init(false), cl::Hidden);
+
+// File-scope timers, grouped under TG, shared by every run of the analysis.
+static llvm::TimerGroup TG("amdgpu-next-use", "AMDGPU Next Use Analysis");
+static llvm::Timer AnalyzeTimer("analyze", "Time spent in analyze()", TG);
+static llvm::Timer GetDistanceTimer("getNextUseDistance",
+                                    "Time spent in getNextUseDistance()", TG);
+
+// Turns a stored distance into a rank for the spiller, using three tiers
+// keyed off the materialized value:
+//   * below LoopTag        -> the distance itself (non-positive becomes 0)
+//   * [LoopTag, DeadTag)   -> 60000 + min(distance - LoopTag, 4999)
+//   * DeadTag and above    -> DeadDistance
+unsigned NextUseResult::materializeForRank(int64_t Stored,
+                                           unsigned SnapshotOffset) const {
+  const int64_t Distance = materialize(Stored, SnapshotOffset);
+
+  // Dead tier: everything at or past DeadTag collapses to one rank.
+  if (Distance >= DeadTag)
+    return DeadDistance;
+
+  // Finite tier: pass the distance through, treating anything at or below
+  // zero as an immediate use.
+  if (Distance < LoopTag)
+    return Distance > 0 ? static_cast<unsigned>(Distance) : 0u;
+
+  // Loop-exit tier: only 5000 slots are available above 60000, so the part
+  // of the distance beyond LoopTag is clamped to 4999.
+  const int64_t AboveLoopTag = std::min<int64_t>(Distance - LoopTag, 4999);
+  return 60000 + static_cast<unsigned>(AboveLoopTag);
+}
+
+// Fills LoopExits from the exit edges of every loop LI reports (visited in
+// preorder): the number of the edge's first block is mapped to the number of
+// its second block. MF is not used here.
+void NextUseResult::init(const MachineFunction &MF) {
+  using BlockEdge = std::pair<MachineBasicBlock *, MachineBasicBlock *>;
+
+  for (const MachineLoop *Loop : LI->getLoopsInPreorder()) {
+    SmallVector<BlockEdge> ExitEdges;
+    Loop->getExitEdges(ExitEdges);
+
+    // One entry per source block: a later edge from the same block replaces
+    // the entry written by an earlier one.
+    for (const BlockEdge &Edge : ExitEdges) {
+      const unsigned FromNum = Edge.first->getNumber();
+      const unsigned ToNum = Edge.second->getNumber();
+      LoopExits[FromNum] = ToNum;
+    }
+  }
+}
+
+void NextUseResult::analyze(const MachineFunction &MF) {
+  // Upward-exposed distances are only necessary to convey the data flow from
+  // the block to its predecessors. No need to store it beyond the analyze
+  // function as the analysis users are only interested in the use distances
+  // relatively to the given MI or the given block end.
+  DenseMap<unsigned, VRegDistances> UpwardNextUses;
+  iterator_range<po_iterator<const llvm::MachineFunction *>> POT =
+      post_order(&MF);
+  if (EnableTimers)
+    AnalyzeTimer.startTimer();
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (const MachineBasicBlock *MBB : POT) {
+      unsigned Offset = 0;
+      unsigned MBBNum = MBB->getNumber();
+      VRegDistances Curr, Prev;
+      DenseMap<unsigned, VRegDistances>::iterator PrevIt =
+          UpwardNextUses.find(MBBNum);
+      if (PrevIt != UpwardNextUses.end()) {
+        Prev = PrevIt->second;
+      }
+
+      LLVM_DEBUG({
+        dbgs() << "\nMerging successors for "
+               << "MBB_" << MBB->getNumber() << "." << MBB->getName() << "\n";
+      });
+
+      for (MachineBasicBlock *Succ : successors(MBB)) {
+        unsigned SuccNum = Succ->getNumber();
+
+        if (!UpwardNextUses.contains(SuccNum))
+          continue;
+
+        VRegDistances SuccDist = UpwardNextUses[SuccNum];
+        LLVM_DEBUG({
+          dbgs() << "\nMerging "
+                 << "MBB_" << Succ->getNumber() << "." << Succ->getName()
+                 << "\n";
+        });
+
+        // Check if the edge from MBB to Succ goes out of the Loop
+        int64_t EdgeWeight = 0;
+        DenseMap<unsigned, unsigned>::iterator LoopExitIt =
+            LoopExits.find(MBB->getNumber());
+        if (LoopExitIt != LoopExits.end()) {
+          if (SuccNum == LoopExitIt->second)
+            EdgeWeight = LoopTag;
+        }
+
+        if (LI->getLoopDepth(MBB) < LI->getLoopDepth(Succ)) {
+          // MBB->Succ is entering the Succ's loop (analysis exiting the loop)
+          // Two transformations:
+          // 1. Outside-loop uses (>= LoopTag): subtract LoopTag
+          // 2. Inside-loop uses (< LoopTag): reset to preheader position
+          //    This models: if spilled before loop, reload at preheader
----------------
ruiling wrote:

It does not seem reasonable for the next-use analysis to adjust distances on the assumption that its users might spill, as the comment mentions. If this adjustment is needed, it should be done explicitly through some interface, such as `adjustVregDistanceForSpill()`, called when the register is actually spilled. Doing the adjustment in the initial analysis does not seem like a good idea.

https://github.com/llvm/llvm-project/pull/156079