[llvm] [AMDGPU] Introduce Next-Use Analysis for SSA-based Register Allocation (PR #156079)

Tue Dec 9 08:13:30 PST 2025

================
@@ -0,0 +1,482 @@
+//===-- AMDGPUNextUseAnalysis.cpp - Next Use Analysis ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the Next Use Analysis for AMDGPU targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUNextUseAnalysis.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Support/Timer.h"
+
+#define DEBUG_TYPE "amdgpu-next-use"
+
+using namespace llvm;
+
+// Hidden command-line flag (default: off) that enables the timers below.
+static cl::opt<bool>
+    EnableTimers("amdgpu-next-use-analysis-timers",
+                 cl::desc("Enable timing for Next Use Analysis"),
+                 cl::init(false), cl::Hidden);
+
+// File-scope timers (group TG); analyze() starts AnalyzeTimer only if EnableTimers is set.
+static llvm::TimerGroup TG("amdgpu-next-use", "AMDGPU Next Use Analysis");
+static llvm::Timer AnalyzeTimer("analyze", "Time spent in analyze()", TG);
+static llvm::Timer GetDistanceTimer("getNextUseDistance",
+                                    "Time spent in getNextUseDistance()", TG);
+
+// Three-tier ranking for spiller decisions: folds materialize(Stored, SnapshotOffset) into one unsigned rank.
+unsigned NextUseResult::materializeForRank(int64_t Stored,
+                                           unsigned SnapshotOffset) const {
+  int64_t Mat64 = materialize(Stored, SnapshotOffset);
+
+  // Tier 1: Finite distances (below LoopTag) → returned as-is, floored at 0
+  // Tier 2: Loop-exit distances (LoopTag to DeadTag-1) → map to 60000-64999
+  // Tier 3: Dead registers (DeadTag+) → return DeadDistance (65535)
+  if (Mat64 >= DeadTag)
+    return DeadDistance;
+
+  if (Mat64 >= LoopTag) {
+    // Tier 2: the offset past LoopTag is mapped into the high range [60000, 64999]
+    int64_t LoopRemainder = Mat64 - LoopTag;
+    // Clamp so that every remainder of 4999 or more shares the top rank 64999
+    unsigned ClampedRemainder = static_cast<unsigned>(
+        std::min(LoopRemainder, static_cast<int64_t>(4999)));
+    return 60000 + ClampedRemainder;
+  }
+
+  if (Mat64 <= 0)
+    return 0; // Tier 1: a zero or negative materialized distance ranks as 0
+
+  return static_cast<unsigned>(Mat64); // Tier 1: as-is. NOTE(review): assumes LoopTag <= 60000 so tier 1 cannot overlap tier 2 — TODO confirm
+}
+
+void NextUseResult::init(const MachineFunction &MF) { // Fills LoopExits from the exit edges of every loop in LI; MF is not read here.
+  for (const MachineLoop *L : LI->getLoopsInPreorder()) {
+    SmallVector<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Exiting; // exit edges; presumably (block inside L, block outside L) — confirm against getExitEdges
+    L->getExitEdges(Exiting);
+    for (const std::pair<MachineBasicBlock *, MachineBasicBlock *> &P :
+         Exiting) {
+      LoopExits[P.first->getNumber()] = P.second->getNumber(); // NOTE(review): keyed by source block number only, so a later exit edge from the same block overwrites an earlier one
+    }
+  }
+}
+
+void NextUseResult::analyze(const MachineFunction &MF) {
+  // Upward-exposed distances are only necessary to convey the data flow from
+  // the block to its predecessors. No need to store them beyond the analyze
+  // function, as the analysis users are only interested in the use distances
+  // relative to the given MI or the given block end.
+  DenseMap<unsigned, VRegDistances> UpwardNextUses;
+  iterator_range<po_iterator<const llvm::MachineFunction *>> POT =
+      post_order(&MF);
+  if (EnableTimers)
+    AnalyzeTimer.startTimer();
+  bool Changed = true;
+  while (Changed) {
+    Changed = false;
+    for (const MachineBasicBlock *MBB : POT) {
+      unsigned Offset = 0;
+      unsigned MBBNum = MBB->getNumber();
+      VRegDistances Curr, Prev;
+      DenseMap<unsigned, VRegDistances>::iterator PrevIt =
+          UpwardNextUses.find(MBBNum);
+      if (PrevIt != UpwardNextUses.end()) {
+        Prev = PrevIt->second;
+      }
+
+      LLVM_DEBUG({
+        dbgs() << "\nMerging successors for "
+               << "MBB_" << MBB->getNumber() << "." << MBB->getName() << "\n";
+      });
+
+      for (MachineBasicBlock *Succ : successors(MBB)) {
+        unsigned SuccNum = Succ->getNumber();
+
+        if (!UpwardNextUses.contains(SuccNum))
+          continue;
+
+        VRegDistances SuccDist = UpwardNextUses[SuccNum];
+        LLVM_DEBUG({
+          dbgs() << "\nMerging "
+                 << "MBB_" << Succ->getNumber() << "." << Succ->getName()
+                 << "\n";
+        });
+
+        // Check if the edge from MBB to Succ goes out of the Loop
+        int64_t EdgeWeight = 0;
+        DenseMap<unsigned, unsigned>::iterator LoopExitIt =
+            LoopExits.find(MBB->getNumber());
+        if (LoopExitIt != LoopExits.end()) {
+          if (SuccNum == LoopExitIt->second)
+            EdgeWeight = LoopTag;
+        }
+
+        if (LI->getLoopDepth(MBB) < LI->getLoopDepth(Succ)) {
+          // MBB->Succ is entering the Succ's loop (analysis exiting the loop)
+          // Two transformations:
+          // 1. Outside-loop uses (>= LoopTag): subtract LoopTag
+          // 2. Inside-loop uses (< LoopTag): reset to preheader position
+          //    This models: if spilled before loop, reload at preheader
----------------
alex-t wrote:

@ruiling, thanks for digging into this - it's worth clarifying the design rationale here.

**The adjustment is semantic, not policy.** The core question is: "What does distance mean when the query point is inside a loop and the use is outside?" Without adjustment, a use 5 instructions after the loop exit appears closer than a use 100 instructions into the loop body. 

But this is misleading because:

- The loop may execute many times
- Any value live across the loop back-edge will be needed at least at iteration N+1

The adjustment doesn't implement a spilling policy - it makes the distance metric reflect the expected distance under normal execution, accounting for loop iterations.

**Preheader reset for in-loop uses:** When the use is also inside the loop, we report the distance to the preheader, not to the actual use. This models the natural reload point: if you spill before a loop and need the value inside, you reload once at loop entry - not repeatedly at the use site.

**Why not defer to users?** Every consumer of this analysis would need to implement the same loop-aware adjustment, duplicating logic and risking inconsistency. The analysis is specifically designed for register allocation (hence "NextUseAnalysis" not "GenericDistanceAnalysis"), so encoding loop semantics seems appropriate.

**That said, I'm open to alternatives:**

1. **Expose raw + adjusted:** Provide both getRawDistance() and getAdjustedDistance() if there are use cases for unmodified values.
2. **Query-time adjustment:** Move the adjustment to query time with a flag: getDistance(MI, Reg, AdjustForLoops=true).
3. **Document the semantics:** If the current approach is acceptable, I can add detailed documentation explaining exactly what "distance" means in loop contexts.

Which approach would you prefer? I want to ensure the API is clear and useful for future consumers while avoiding unnecessary complexity.

https://github.com/llvm/llvm-project/pull/156079