[llvm] 8823efe - [AMDGPU] Add register usage debug printing the point of maximum register pressure. (#161850)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 13 07:17:16 PDT 2025
Author: Valery Pykhtin
Date: 2025-10-13T16:17:11+02:00
New Revision: 8823efe77dad40eaea63b539c4d3d1036587ceb0
URL: https://github.com/llvm/llvm-project/commit/8823efe77dad40eaea63b539c4d3d1036587ceb0
DIFF: https://github.com/llvm/llvm-project/commit/8823efe77dad40eaea63b539c4d3d1036587ceb0.diff
LOG: [AMDGPU] Add register usage debug printing the point of maximum register pressure. (#161850)
Basically, this makes it possible to analyze "why are so many VGPRs used?".
It prints all live registers at the point of maximum register pressure
and for each register its defs/uses are dumped.
Currently it can be run before and after the scheduler, but it would be nice if
it could be run in between any passes (not sure this is possible with the
legacy pass manager). Requires a debug build or a compiler built with asserts.
Highly recommended to run with debug info to have debug locations for
instructions.
Example output:
```
*** Register pressure info (VGPRs) for _ZN7ck_tile6ken.... ***
Max pressure is 256 VGPRs at 41780e at BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10137:vreg_128_align2, %10141:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
Live registers with single definition (123 VGPRs):
%10126:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
def 41600r at BB.18 (LoopHdr BB.16, Depth 1): undef %10126.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 15232, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1314, !noalias !60, addrspace 3)
def 41608r at BB.18 (LoopHdr BB.16, Depth 1): %10126.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 16320, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1315, !noalias !60, addrspace 3)
use 41848r at BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10126:vreg_128_align2, %10138:vreg_128_align2, %9856:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
%10136:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
def 41264r at BB.18 (LoopHdr BB.16, Depth 1): undef %10136.sub0_sub1:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 2176, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1294, !noalias !60, addrspace 3)
def 41272r at BB.18 (LoopHdr BB.16, Depth 1): %10136.sub2_sub3:vreg_128_align2 = DS_READ_B64_TR_B16 %478:vgpr_32, 3264, 0, implicit $exec :: (load (s64) from %ir.sunkaddr1295, !noalias !60, addrspace 3)
use 41788r at BB.18 (LoopHdr BB.16, Depth 1): %9858:vreg_512_align2 = contract V_MFMA_F32_32X32X16_BF16_mac_vgprcd_e64 %10136:vreg_128_align2, %10140:vreg_128_align2, %9858:vreg_512_align2(tied-def 0), 0, 0, 0, implicit $mode, implicit $exec
%10129:VReg_128_Align2, LiveMask 00000000000000FF (4 VGPRs)
...
Live registers with multiple definitions (133 VGPRs):
%9856:VReg_512_Align2, LiveMask 00000000FFFFFFFF (16 VGPRs)
def 16544r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def undef %9856.sub0_sub1:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16592r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub2_sub3:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16608r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub4_sub5:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16656r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub6_sub7:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16672r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub8_sub9:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16720r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub10_sub11:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16736r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub12_sub13:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def 16784r at BB.8: INLINEASM &"v_pk_mul_f32 $0, $1, $2" [sideeffect] [isconvergent] [attdialect], $0:[regdef:VReg_64_Align2], def %9856.sub14_sub15:vreg_512_align2, $1:[reguse:VReg_64_Align2], %4069:vreg_64_align2, $2:[reguse:VReg_64_Align2], %10159:vreg_64_align2, !52
def use 41828r at BB.18 (LoopHdr BB.16, Depth 1): %9856:vreg_512_align2 = contract V_MFMA_F32_...
...
********** INTERVALS **********
...
********** MACHINEINSTRS **********
# Machine code for function _ZN7ck_tile6kentr...
```
Added:
Modified:
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
llvm/lib/Target/AMDGPU/GCNRegPressure.h
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 71494be74ec52..4e11c4ff3d56e 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,150 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Dump to dbgs() the registers of kind \p Kind that are live at the point of
+/// maximum register pressure in \p MF, with the defs and uses of each one.
+///
+/// \param MF   Function to analyze.
+/// \param Kind Register kind to report; must be less than TOTAL_KINDS.
+/// \param LIS  Live intervals supplying slot indexes and liveness.
+/// \param MLI  Optional loop info; if non-null, each location is annotated
+///             with its loop header and loop depth.
+///
+/// When no instruction shows non-zero pressure for \p Kind, only a short
+/// notice is printed.
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+                                               GCNRegPressure::RegKind Kind,
+                                               LiveIntervals &LIS,
+                                               const MachineLoopInfo *MLI) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+  auto &OS = dbgs();
+  const char *RegName = GCNRegPressure::getName(Kind);
+
+  // Walk every block bottom-up and remember the instruction at which a new
+  // maximum for this register kind is first observed.
+  unsigned MaxNumRegs = 0;
+  const MachineInstr *MaxPressureMI = nullptr;
+  GCNUpwardRPTracker RPT(LIS);
+  for (const MachineBasicBlock &MBB : MF) {
+    RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+    for (const MachineInstr &MI : reverse(MBB)) {
+      RPT.recede(MI);
+      unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+      if (NumRegs > MaxNumRegs) {
+        MaxNumRegs = NumRegs;
+        MaxPressureMI = &MI;
+      }
+    }
+  }
+
+  OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+     << " ***\n";
+
+  // MaxPressureMI is still null if the function has no instructions or the
+  // pressure for this kind never rose above zero. Bail out instead of
+  // dereferencing a null pointer below.
+  if (!MaxPressureMI) {
+    OS << "No instruction with non-zero " << RegName << " pressure found.\n";
+    return;
+  }
+
+  SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+  // Max pressure can occur at either the early-clobber or register slot.
+  // Choose the maximum liveset between both slots. This is ugly but this is
+  // diagnostic code.
+  SlotIndex ECSlot = MISlot.getRegSlot(true);
+  SlotIndex RSlot = MISlot.getRegSlot(false);
+  GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+  GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+  unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+  unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+  GCNRPTracker::LiveRegSet *LiveSet =
+      ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+  SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+  assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+  // Split live registers into single-def and multi-def sets.
+  GCNRegPressure SDefPressure, MDefPressure;
+  SmallVector<Register, 16> SDefRegs, MDefRegs;
+  for (auto [Reg, LaneMask] : *LiveSet) {
+    assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+    LiveInterval &LI = LIS.getInterval(Reg);
+    if (LI.getNumValNums() == 1 ||
+        (LI.hasSubRanges() &&
+         llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+           return SR.getNumValNums() == 1;
+         }))) {
+      SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+      SDefRegs.push_back(Reg);
+    } else {
+      MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+      MDefRegs.push_back(Reg);
+    }
+  }
+  unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+  unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+  assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+  // Format "<slot>:<block>" and, when loop info is available and reports a
+  // loop for the block, append that loop's header and depth.
+  auto printLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
+    return Printable([&, MBB, SI](raw_ostream &OS) {
+      OS << SI << ':' << printMBBReference(*MBB);
+      if (MLI)
+        if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+          OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
+             << ", Depth " << ML->getLoopDepth() << ")";
+    });
+  };
+
+  // Print one register with its live lane mask and register count, followed
+  // by every non-debug instruction that references it, in slot order.
+  auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+    GCNRegPressure RegPressure;
+    RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+    OS << " " << printReg(Reg, TRI) << ':'
+       << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+       << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+       << RegName << "s)\n";
+
+    // Use std::map to sort def/uses by SlotIndex.
+    std::map<SlotIndex, const MachineInstr *> Instrs;
+    for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg))
+      Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+
+    for (const auto &[SI, MI] : Instrs) {
+      OS << " ";
+      if (MI->definesRegister(Reg, TRI))
+        OS << "def ";
+      if (MI->readsRegister(Reg, TRI))
+        OS << "use ";
+      OS << printLoc(MI->getParent(), SI) << ": " << *MI;
+    }
+  };
+
+  OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+     << printLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+     << *MaxPressureMI;
+
+  OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+     << RegName << "s):\n";
+
+  // Sort SDefRegs by number of uses (smallest first)
+  llvm::sort(SDefRegs, [&](Register A, Register B) {
+    return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
+           std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
+  });
+
+  for (const Register Reg : SDefRegs)
+    PrintRegInfo(Reg, LiveSet->lookup(Reg));
+
+  OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+     << RegName << "s):\n";
+  for (const Register Reg : MDefRegs)
+    PrintRegInfo(Reg, LiveSet->lookup(Reg));
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 898d1ffc10b79..979a8b0abfb4c 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,12 @@ class SlotIndex;
struct GCNRegPressure {
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
+ static constexpr const char *getName(RegKind Kind) { // Returns the printable name of a register kind, e.g. "VGPR".
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"}; // Listed in the same order as the RegKind enumerators.
+ assert(Kind < TOTAL_KINDS); // TOTAL_KINDS has no entry in Names.
+ return Names[Kind];
+ }
+
GCNRegPressure() {
clear();
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ unsigned getNumRegs(RegKind Kind) const { // Returns the pressure recorded in Value for the given register kind.
+ assert(Kind < TOTAL_KINDS); // Only real kinds are accepted; TOTAL_KINDS is rejected.
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
void dump() const;
+ static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) { // Classifies Reg by the register class MRI reports for it.
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const SIRegisterInfo *STI = static_cast<const SIRegisterInfo *>(TRI); // NOTE(review): unchecked downcast; assumes the target register info is an SIRegisterInfo.
+ return (RegKind)getRegKind(MRI.getRegClass(Reg), STI); // Delegates to the (register class, SIRegisterInfo) overload declared outside this hunk. NOTE(review): C-style cast.
+ }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ class GCNRPTracker {
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS); // With the default TOTAL_KINDS no register is skipped because of its kind; any other value keeps only registers of that kind.
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ struct GCNRegPressurePrinter : public MachineFunctionPass {
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, // Prints the registers of one kind that are live at the max-pressure point of MF to dbgs().
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI); // MLI may be null. The definition is compiled only when !NDEBUG or LLVM_ENABLE_DUMP is defined.
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc08101c7119..58482ea69d0b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // Same condition that guards the definition of dumpMaxRegPressure.
+#define DUMP_MAX_REG_PRESSURE // Tested by the #ifdef blocks in GCNScheduleDAGMILive::runSchedStages.
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false)); // Off by default.
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false)); // Off by default.
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageBeforeScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
}
#ifndef NDEBUG
More information about the llvm-commits
mailing list