[llvm] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR to AGPR (PR #149367)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 15:54:49 PDT 2025
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/149367
>From 929584982b8fd23e45e27420ed3218fe67076287 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 23 Jul 2025 15:41:11 -0700
Subject: [PATCH 1/5] [AMDGPU] More accurately account for AVGPR pressure
Change-Id: I6f129c2723b52a391a96178e390f60535164ac9b
---
.../Target/AMDGPU/GCNIterativeScheduler.cpp | 46 +-
llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 156 +++---
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 129 +++--
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 72 +--
.../lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 9 +-
llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir | 481 ++++++++++++++++++
.../AMDGPU/debug-value-scheduler-liveins.mir | 2 +-
7 files changed, 726 insertions(+), 169 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index f253a841f16a6..87f5b9f16868a 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -447,11 +447,7 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
// BestSchedules aren't deleted on fail.
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
- const auto &ST = MF.getSubtarget<GCNSubtarget>();
- const unsigned DynamicVGPRBlockSize =
- MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
- const auto Occ =
- Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+ const auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');
@@ -460,7 +456,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);
- if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
+ if (R->MaxPressure.getOccupancy(MF) >= NewOcc)
continue;
LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
@@ -471,7 +467,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));
- NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
+ NewOcc = std::min(NewOcc, MaxRP.getOccupancy(MF));
if (NewOcc <= Occ)
break;
@@ -488,15 +484,12 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
}
void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
- bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ bool TryMaximizeOccupancy) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
- unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
- auto Occ =
- Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -527,21 +520,19 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
- if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+ if (RP.getOccupancy(MF) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
- if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
- ST, DynamicVGPRBlockSize) >= TgtOcc) {
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
- assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
- TgtOcc);
+ assert(R->MaxPressure.getOccupancy(MF) >= TgtOcc);
}
}
- FinalOccupancy =
- std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
}
}
MFI->limitOccupancy(FinalOccupancy);
@@ -582,16 +573,12 @@ void GCNIterativeScheduler::scheduleMinReg(bool force) {
///////////////////////////////////////////////////////////////////////////////
// ILP scheduler port
-void GCNIterativeScheduler::scheduleILP(
- bool TryMaximizeOccupancy) {
- const auto &ST = MF.getSubtarget<GCNSubtarget>();
+void GCNIterativeScheduler::scheduleILP(bool TryMaximizeOccupancy) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
- unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();
sortRegionsByPressure(TgtOcc);
- auto Occ =
- Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
+ auto Occ = Regions.front()->MaxPressure.getOccupancy(MF);
bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
@@ -612,18 +599,17 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
- if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
+ if (RP.getOccupancy(MF) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
- if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
- ST, DynamicVGPRBlockSize) >= TgtOcc) {
+ if (R->BestSchedule.get() &&
+ R->BestSchedule->MaxPressure.getOccupancy(MF) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
- FinalOccupancy =
- std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
+ FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(MF));
}
}
MFI->limitOccupancy(FinalOccupancy);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 334afd3a2a5b4..dd007e6cd6b31 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -99,20 +99,22 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc = std::min(
- MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
- DynamicVGPRBlockSize));
+ MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+ getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
+ DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
- const auto OtherVGPROcc =
- std::min(MaxOccupancy,
- ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
- DynamicVGPRBlockSize));
+ const auto OtherVGPROcc = std::min(
+ MaxOccupancy, ST.getOccupancyWithNumVGPRs(
+ O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold),
+ DynamicVGPRBlockSize));
const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
@@ -135,35 +137,39 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned OtherVGPRForSGPRSpills =
(OtherExcessSGPR + (WaveSize - 1)) / WaveSize;
- unsigned MaxArchVGPRs = ST.getAddressableNumArchVGPRs();
-
// Unified excess pressure conditions, accounting for VGPRs used for SGPR
// spills
- unsigned ExcessVGPR =
- std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) +
- VGPRForSGPRSpills - MaxVGPRs),
- 0);
- unsigned OtherExcessVGPR =
- std::max(static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) +
- OtherVGPRForSGPRSpills - MaxVGPRs),
- 0);
+ unsigned ExcessVGPR = std::max(
+ static_cast<int>(getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
+ VGPRForSGPRSpills - MaxVGPRs),
+ 0);
+ unsigned OtherExcessVGPR = std::max(
+ static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) +
+ OtherVGPRForSGPRSpills - MaxVGPRs),
+ 0);
// Arch VGPR excess pressure conditions, accounting for VGPRs used for SGPR
// spills
- unsigned ExcessArchVGPR = std::max(
- static_cast<int>(getVGPRNum(false) + VGPRForSGPRSpills - MaxArchVGPRs),
- 0);
+ unsigned AddressableArchVGPRs = ST.getAddressableNumArchVGPRs();
+ unsigned ExcessArchVGPR =
+ std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) +
+ VGPRForSGPRSpills - AddressableArchVGPRs),
+ 0);
unsigned OtherExcessArchVGPR =
- std::max(static_cast<int>(O.getVGPRNum(false) + OtherVGPRForSGPRSpills -
- MaxArchVGPRs),
+ std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) +
+ OtherVGPRForSGPRSpills - AddressableArchVGPRs),
0);
// AGPR excess pressure conditions
- unsigned ExcessAGPR = std::max(
- static_cast<int>(ST.hasGFX90AInsts() ? (getAGPRNum() - MaxArchVGPRs)
- : (getAGPRNum() - MaxVGPRs)),
- 0);
+ unsigned ExcessAGPR =
+ std::max(static_cast<int>(
+ ST.hasGFX90AInsts()
+ ? (getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+ : (getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
+ 0);
unsigned OtherExcessAGPR = std::max(
- static_cast<int>(ST.hasGFX90AInsts() ? (O.getAGPRNum() - MaxArchVGPRs)
- : (O.getAGPRNum() - MaxVGPRs)),
+ static_cast<int>(
+ ST.hasGFX90AInsts()
+ ? (O.getAGPRNum(ArchVGPRThreshold) - AddressableArchVGPRs)
+ : (O.getAGPRNum(ArchVGPRThreshold) - MaxVGPRs)),
0);
bool ExcessRP = ExcessSGPR || ExcessVGPR || ExcessArchVGPR || ExcessAGPR;
@@ -184,14 +190,21 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
return VGPRDiff > 0;
if (SGPRDiff != 0) {
unsigned PureExcessVGPR =
- std::max(static_cast<int>(getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
+ std::max(static_cast<int>(
+ getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+ MaxVGPRs),
0) +
- std::max(static_cast<int>(getVGPRNum(false) - MaxArchVGPRs), 0);
+ std::max(static_cast<int>(getVGPRNum(false, ArchVGPRThreshold) -
+ AddressableArchVGPRs),
+ 0);
unsigned OtherPureExcessVGPR =
- std::max(
- static_cast<int>(O.getVGPRNum(ST.hasGFX90AInsts()) - MaxVGPRs),
- 0) +
- std::max(static_cast<int>(O.getVGPRNum(false) - MaxArchVGPRs), 0);
+ std::max(static_cast<int>(
+ O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) -
+ MaxVGPRs),
+ 0) +
+ std::max(static_cast<int>(O.getVGPRNum(false, ArchVGPRThreshold) -
+ AddressableArchVGPRs),
+ 0);
// If we have a special case where there is a tie in excess VGPR, but one
// of the pressures has VGPR usage from SGPR spills, prefer the pressure
@@ -221,38 +234,45 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
if (SW != OtherSW)
return SW < OtherSW;
} else {
- auto VW = getVGPRTuplesWeight();
- auto OtherVW = O.getVGPRTuplesWeight();
+ auto VW = getVGPRTuplesWeight(ArchVGPRThreshold);
+ auto OtherVW = O.getVGPRTuplesWeight(ArchVGPRThreshold);
if (VW != OtherVW)
return VW < OtherVW;
}
}
// Give final precedence to lower general RP.
- return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
- (getVGPRNum(ST.hasGFX90AInsts()) <
- O.getVGPRNum(ST.hasGFX90AInsts()));
+ return SGPRImportant ? (getSGPRNum() < O.getSGPRNum())
+ : (getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <
+ O.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold));
}
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
- unsigned DynamicVGPRBlockSize) {
- return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
- OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
- << "AGPRs: " << RP.getAGPRNum();
- if (ST)
- OS << "(O"
- << ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
- DynamicVGPRBlockSize)
- << ')';
- OS << ", SGPRs: " << RP.getSGPRNum();
- if (ST)
- OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
- OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
- << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
- if (ST)
- OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
- OS << '\n';
- });
+ unsigned DynamicVGPRBlockSize,
+ const MachineFunction *MF) {
+ unsigned ArchVGPRThreshold = std::numeric_limits<unsigned int>::max();
+ if (ST && MF)
+ ArchVGPRThreshold = ST->getMaxNumVectorRegs(MF->getFunction()).first;
+
+ return Printable(
+ [&RP, ST, DynamicVGPRBlockSize, ArchVGPRThreshold, MF](raw_ostream &OS) {
+ OS << "VGPRs: " << RP.getArchVGPRNum(ArchVGPRThreshold) << ' '
+ << "AGPRs: " << RP.getAGPRNum(ArchVGPRThreshold);
+ if (ST)
+ OS << "(O"
+ << ST->getOccupancyWithNumVGPRs(
+ RP.getVGPRNum(ST->hasGFX90AInsts(), ArchVGPRThreshold),
+ DynamicVGPRBlockSize)
+ << ')';
+ OS << ", SGPRs: " << RP.getSGPRNum();
+ if (ST)
+ OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
+ OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight(ArchVGPRThreshold)
+ << ", LSGPR WT: " << RP.getSGPRTuplesWeight();
+ if (ST)
+ OS << " -> Occ: " << RP.getOccupancy(*MF);
+ OS << '\n';
+ });
}
static LaneBitmask getDefRegMask(const MachineOperand &MO,
@@ -398,8 +418,9 @@ void GCNRPTarget::setRegLimits(unsigned NumSGPRs, unsigned NumVGPRs,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+ AddressableNumArchVGPRs = ST.getAddressableNumArchVGPRs();
MaxSGPRs = std::min(ST.getAddressableNumSGPRs(), NumSGPRs);
- MaxVGPRs = std::min(ST.getAddressableNumArchVGPRs(), NumVGPRs);
+ MaxVGPRs = std::min(AddressableNumArchVGPRs, NumVGPRs);
MaxUnifiedVGPRs =
ST.hasGFX90AInsts()
? std::min(ST.getAddressableNumVGPRs(DynamicVGPRBlockSize), NumVGPRs)
@@ -414,15 +435,21 @@ bool GCNRPTarget::isSaveBeneficial(Register Reg,
if (SRI->isSGPRClass(RC))
return RP.getSGPRNum() > MaxSGPRs;
- unsigned NumVGPRs =
- SRI->isAGPRClass(RC) ? RP.getAGPRNum() : RP.getArchVGPRNum();
+
+ bool ShouldUseAGPR =
+ SRI->isAGPRClass(RC) ||
+ (SRI->isVectorSuperClass(RC) &&
+ RP.getArchVGPRNum(AddressableNumArchVGPRs) >= AddressableNumArchVGPRs);
+ unsigned NumVGPRs = ShouldUseAGPR
+ ? RP.getAGPRNum(AddressableNumArchVGPRs)
+ : RP.getArchVGPRNum(AddressableNumArchVGPRs);
return isVGPRBankSaveBeneficial(NumVGPRs);
}
bool GCNRPTarget::satisfied() const {
if (RP.getSGPRNum() > MaxSGPRs)
return false;
- if (RP.getVGPRNum(false) > MaxVGPRs &&
+ if (RP.getVGPRNum(false, AddressableNumArchVGPRs) > MaxVGPRs &&
(!CombineVGPRSavings || !satisifiesVGPRBanksTarget()))
return false;
return satisfiesUnifiedTarget();
@@ -876,10 +903,13 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
OS << "---\nname: " << MF.getName() << "\nbody: |\n";
- auto printRP = [](const GCNRegPressure &RP) {
- return Printable([&RP](raw_ostream &OS) {
+ auto printRP = [&MF](const GCNRegPressure &RP) {
+ return Printable([&RP, &MF](raw_ostream &OS) {
OS << format(PFX " %-5d", RP.getSGPRNum())
- << format(" %-5d", RP.getVGPRNum(false));
+ << format(" %-5d", RP.getVGPRNum(false, MF.getSubtarget<GCNSubtarget>()
+ .getMaxNumVectorRegs(
+ MF.getFunction())
+ .first));
});
};
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index ea33a229110c1..8b80cc42c9bb0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -18,6 +18,7 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include <algorithm>
@@ -43,51 +44,98 @@ struct GCNRegPressure {
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
- /// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
- /// dependent upon \p UnifiedVGPRFile
- unsigned getVGPRNum(bool UnifiedVGPRFile) const {
+ unsigned getVGPRNum(bool UnifiedVGPRFile,
+ unsigned AddressableArchVGPR) const {
if (UnifiedVGPRFile) {
- return Value[AGPR]
- ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR])
- : Value[VGPR] + Value[AVGPR];
+ return Value[AGPR] || Value[AVGPR]
+ ? getUnifiedVGPRNum(Value[VGPR], Value[AGPR], Value[AVGPR],
+ AddressableArchVGPR)
+ : Value[VGPR];
}
// AVGPR assignment priority is based on the width of the register. Account
// AVGPR pressure as VGPR.
return std::max(Value[VGPR] + Value[AVGPR], Value[AGPR]);
}
+ inline static unsigned getAVGPRsAsVGPRsNum(unsigned NumArchVGPRs,
+ unsigned NumAVGPRs,
+ unsigned AddressableArchVGPR) {
+
+ return NumArchVGPRs < AddressableArchVGPR
+ ? std::min((AddressableArchVGPR - NumArchVGPRs), NumAVGPRs)
+ : 0;
+ }
+
+ inline static unsigned getAVGPRsAsAGPRsNum(unsigned NumArchVGPRs,
+ unsigned NumAGPRs,
+ unsigned NumAVGPRs,
+ unsigned AddressableArchVGPR) {
+ unsigned AVGPRsAsVGPRs =
+ getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+ return NumAVGPRs > AVGPRsAsVGPRs ? NumAVGPRs - AVGPRsAsVGPRs : 0;
+ }
+
/// Returns the aggregated VGPR pressure, assuming \p NumArchVGPRs ArchVGPRs
/// \p NumAGPRs AGPRS, and \p NumAVGPRs AVGPRs for a target with a unified
/// VGPR file.
inline static unsigned getUnifiedVGPRNum(unsigned NumArchVGPRs,
unsigned NumAGPRs,
- unsigned NumAVGPRs) {
-
- // Assume AVGPRs will be assigned as VGPRs.
- return alignTo(NumArchVGPRs + NumAVGPRs,
+ unsigned NumAVGPRs,
+ unsigned AddressableArchVGPR) {
+
+ // Until we hit the VGPRThreshold, we will assign AV as VGPR. After that
+ // point, we will assign as AGPR.
+ unsigned AVGPRsAsVGPRs =
+ getAVGPRsAsVGPRsNum(NumArchVGPRs, NumAVGPRs, AddressableArchVGPR);
+ unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+ NumArchVGPRs, NumAGPRs, NumAVGPRs, AddressableArchVGPR);
+ return alignTo(NumArchVGPRs + AVGPRsAsVGPRs,
AMDGPU::IsaInfo::getArchVGPRAllocGranule()) +
- NumAGPRs;
+ NumAGPRs + AVGPRsAsAGPRs;
}
/// \returns the ArchVGPR32 pressure, plus the AVGPRS which we assume will be
/// allocated as VGPR
- unsigned getArchVGPRNum() const { return Value[VGPR] + Value[AVGPR]; }
+ unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
+ unsigned AVGPRsAsVGPRs =
+ getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR);
+
+ return Value[VGPR] + AVGPRsAsVGPRs;
+ }
/// \returns the AccVGPR32 pressure
- unsigned getAGPRNum() const { return Value[AGPR]; }
+ unsigned getAGPRNum(unsigned AddressableArchVGPR) const {
+ unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+ Value[VGPR], Value[AGPR], Value[AVGPR], AddressableArchVGPR);
+
+ return Value[AGPR] + AVGPRsAsAGPRs;
+ }
/// \returns the AVGPR32 pressure
unsigned getAVGPRNum() const { return Value[AVGPR]; }
- unsigned getVGPRTuplesWeight() const {
- return std::max(Value[TOTAL_KINDS + VGPR] + Value[TOTAL_KINDS + AVGPR],
- Value[TOTAL_KINDS + AGPR]);
+ unsigned getVGPRTuplesWeight(unsigned AddressableArchVGPR) const {
+ unsigned AVGPRsAsVGPRs =
+ getAVGPRsAsVGPRsNum(Value[TOTAL_KINDS + VGPR],
+ Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
+ unsigned AVGPRsAsAGPRs = getAVGPRsAsAGPRsNum(
+ Value[TOTAL_KINDS + VGPR], Value[TOTAL_KINDS + AGPR],
+ Value[TOTAL_KINDS + AVGPR], AddressableArchVGPR);
+
+ return std::max(Value[TOTAL_KINDS + VGPR] + AVGPRsAsVGPRs,
+ Value[TOTAL_KINDS + AGPR] + AVGPRsAsAGPRs);
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }
- unsigned getOccupancy(const GCNSubtarget &ST,
- unsigned DynamicVGPRBlockSize) const {
- return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
- ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
- DynamicVGPRBlockSize));
+ unsigned getOccupancy(const MachineFunction &MF) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ unsigned DynamicVGPRBlockSize =
+ MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
+
+ return std::min(
+ ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+ ST.getOccupancyWithNumVGPRs(
+ getVGPRNum(ST.hasGFX90AInsts(),
+ ST.getMaxNumVectorRegs(MF.getFunction()).first),
+ DynamicVGPRBlockSize));
}
void inc(unsigned Reg,
@@ -95,10 +143,9 @@ struct GCNRegPressure {
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);
- bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
- unsigned DynamicVGPRBlockSize) const {
- return getOccupancy(ST, DynamicVGPRBlockSize) >
- O.getOccupancy(ST, DynamicVGPRBlockSize);
+ bool higherOccupancy(const GCNRegPressure &O,
+ const MachineFunction &MF) const {
+ return getOccupancy(MF) > O.getOccupancy(MF);
}
/// Compares \p this GCNRegpressure to \p O, returning true if \p this is
@@ -151,7 +198,7 @@ struct GCNRegPressure {
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);
- friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
+ friend Printable print(const GCNRegPressure &RP,
unsigned DynamicVGPRBlockSize);
};
@@ -220,16 +267,19 @@ class GCNRPTarget {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
friend raw_ostream &operator<<(raw_ostream &OS, const GCNRPTarget &Target) {
OS << "Actual/Target: " << Target.RP.getSGPRNum() << '/' << Target.MaxSGPRs
- << " SGPRs, " << Target.RP.getArchVGPRNum() << '/' << Target.MaxVGPRs
- << " ArchVGPRs, " << Target.RP.getAGPRNum() << '/' << Target.MaxVGPRs
- << " AGPRs";
+ << " SGPRs, " << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs)
+ << '/' << Target.MaxVGPRs << " ArchVGPRs, "
+ << Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs) << '/'
+ << Target.MaxVGPRs << " AGPRs";
if (Target.MaxUnifiedVGPRs) {
- OS << ", " << Target.RP.getVGPRNum(true) << '/' << Target.MaxUnifiedVGPRs
- << " VGPRs (unified)";
+ OS << ", " << Target.RP.getVGPRNum(true, Target.AddressableNumArchVGPRs)
+ << '/' << Target.MaxUnifiedVGPRs << " VGPRs (unified)";
} else if (Target.CombineVGPRSavings) {
- OS << ", " << Target.RP.getArchVGPRNum() + Target.RP.getAGPRNum() << '/'
- << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
+ OS << ", "
+ << Target.RP.getArchVGPRNum(Target.AddressableNumArchVGPRs) +
+ Target.RP.getAGPRNum(Target.AddressableNumArchVGPRs)
+ << '/' << 2 * Target.MaxVGPRs << " VGPRs (combined target)";
}
return OS;
}
@@ -238,7 +288,6 @@ class GCNRPTarget {
private:
/// Current register pressure.
GCNRegPressure RP;
-
/// Target number of SGPRs.
unsigned MaxSGPRs;
/// Target number of ArchVGPRs and AGPRs.
@@ -246,6 +295,8 @@ class GCNRPTarget {
/// Target number of overall VGPRs for subtargets with unified RFs. Always 0
/// for subtargets with non-unified RFs.
unsigned MaxUnifiedVGPRs;
+ /// The maximum number of arch vgprs allowed by the subtarget.
+ unsigned AddressableNumArchVGPRs;
/// Whether we consider that the register allocator will be able to swap
/// between ArchVGPRs and AGPRs by copying them to a super register class.
/// Concretely, this allows savings in one of the VGPR banks to help toward
@@ -254,12 +305,15 @@ class GCNRPTarget {
inline bool satisifiesVGPRBanksTarget() const {
assert(CombineVGPRSavings && "only makes sense with combined savings");
- return RP.getArchVGPRNum() + RP.getAGPRNum() <= 2 * MaxVGPRs;
+ return RP.getArchVGPRNum(AddressableNumArchVGPRs) +
+ RP.getAGPRNum(AddressableNumArchVGPRs) <=
+ 2 * MaxVGPRs;
}
/// Always satisified when the subtarget doesn't have a unified RF.
inline bool satisfiesUnifiedTarget() const {
- return !MaxUnifiedVGPRs || RP.getVGPRNum(true) <= MaxUnifiedVGPRs;
+ return !MaxUnifiedVGPRs ||
+ RP.getVGPRNum(true, AddressableNumArchVGPRs) <= MaxUnifiedVGPRs;
}
inline bool isVGPRBankSaveBeneficial(unsigned NumVGPRs) const {
@@ -517,7 +571,8 @@ bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
- unsigned DynamicVGPRBlockSize = 0);
+ unsigned DynamicVGPRBlockSize = 0,
+ const MachineFunction *MF = nullptr);
Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ce1ce687d0038..3cf9a7c0f972e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -190,10 +190,14 @@ static void getRegisterPressures(
TempUpwardTracker.recede(*MI);
NewPressure = TempUpwardTracker.getPressure();
}
+ unsigned ArchVGPRThreshold = DAG->MF.getSubtarget<GCNSubtarget>()
+ .getMaxNumVectorRegs(DAG->MF.getFunction())
+ .first;
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
- NewPressure.getArchVGPRNum();
- Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
+ NewPressure.getArchVGPRNum(ArchVGPRThreshold);
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] =
+ NewPressure.getAGPRNum(ArchVGPRThreshold);
}
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
@@ -339,7 +343,10 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
- VGPRPressure = T->getPressure().getArchVGPRNum();
+ VGPRPressure = T->getPressure().getArchVGPRNum(
+ DAG->MF.getSubtarget<GCNSubtarget>()
+ .getMaxNumVectorRegs(DAG->MF.getFunction())
+ .first);
}
}
ReadyQueue &Q = Zone.Available;
@@ -1140,8 +1147,7 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
if (DAG.MinOccupancy > InitialOccupancy) {
for (unsigned IDX = 0; IDX < DAG.Pressure.size(); ++IDX)
DAG.RegionsWithMinOcc[IDX] =
- DAG.Pressure[IDX].getOccupancy(
- DAG.ST, DAG.MFI.getDynamicVGPRBlockSize()) == DAG.MinOccupancy;
+ DAG.Pressure[IDX].getOccupancy(DAG.MF) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
@@ -1193,8 +1199,10 @@ bool GCNSchedStage::initGCNRegion() {
dbgs() << "Pressure before scheduling:\nRegion live-ins:"
<< print(DAG.LiveIns[RegionIdx], DAG.MRI)
<< "Region live-in pressure: "
- << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
- << "Region register pressure: " << print(PressureBefore));
+ << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]),
+ &ST, 0, &MF)
+ << "Region register pressure: "
+ << print(PressureBefore, &ST, 0, &MF));
S.HasHighPressure = false;
S.KnownExcessRP = isRegionWithExcessRP();
@@ -1275,17 +1283,17 @@ void GCNSchedStage::checkScheduling() {
// Check the results of scheduling.
PressureAfter = DAG.getRealRegPressure(RegionIdx);
- LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
+ LLVM_DEBUG(dbgs() << "Pressure after scheduling: "
+ << print(PressureAfter, &ST, 0, &MF));
LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
- unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
-
+ unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
- PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
+ PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) <=
+ S.VGPRCriticalLimit) {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
- DAG.MinOccupancy;
+ PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
// Early out if we have achieved the occupancy target.
LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
@@ -1294,10 +1302,10 @@ void GCNSchedStage::checkScheduling() {
unsigned TargetOccupancy = std::min(
S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
- unsigned WavesAfter = std::min(
- TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
- unsigned WavesBefore = std::min(
- TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
+ unsigned WavesAfter =
+ std::min(TargetOccupancy, PressureAfter.getOccupancy(DAG.MF));
+ unsigned WavesBefore =
+ std::min(TargetOccupancy, PressureBefore.getOccupancy(DAG.MF));
LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
<< ", after " << WavesAfter << ".\n");
@@ -1331,9 +1339,10 @@ void GCNSchedStage::checkScheduling() {
unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
- if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
- PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
- PressureAfter.getAGPRNum() > MaxArchVGPRs ||
+ if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts(), ArchVGPRThreshold) >
+ MaxVGPRs ||
+ PressureAfter.getArchVGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
+ PressureAfter.getAGPRNum(ArchVGPRThreshold) > MaxArchVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs) {
DAG.RegionsWithHighRP[RegionIdx] = true;
DAG.RegionsWithExcessRP[RegionIdx] = true;
@@ -1346,8 +1355,7 @@ void GCNSchedStage::checkScheduling() {
} else {
DAG.Pressure[RegionIdx] = PressureAfter;
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize) ==
- DAG.MinOccupancy;
+ PressureAfter.getOccupancy(DAG.MF) == DAG.MinOccupancy;
}
}
@@ -1471,12 +1479,13 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
// For dynamic VGPR mode, we don't want to waste any VGPR blocks.
if (DAG.MFI.isDynamicVGPREnabled()) {
+ unsigned ArchVGPRThreshold = ST.getMaxNumVectorRegs(MF.getFunction()).first;
unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
- PressureBefore.getVGPRNum(false));
+ PressureBefore.getVGPRNum(false, ArchVGPRThreshold));
unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
&ST, DAG.MFI.getDynamicVGPRBlockSize(),
- PressureAfter.getVGPRNum(false));
+ PressureAfter.getVGPRNum(false, ArchVGPRThreshold));
if (BlocksAfter > BlocksBefore)
return true;
}
@@ -1500,8 +1509,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
// If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
- if ((WavesAfter <=
- PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
+ if ((WavesAfter <= PressureBefore.getOccupancy(DAG.MF) &&
mayCauseSpilling(WavesAfter)) ||
GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
@@ -1523,9 +1531,8 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
ScheduleMetrics MAfter = getScheduleMetrics(DAG);
unsigned OldMetric = MBefore.getMetric();
unsigned NewMetric = MAfter.getMetric();
- unsigned WavesBefore = std::min(
- S.getTargetOccupancy(),
- PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
+ unsigned WavesBefore =
+ std::min(S.getTargetOccupancy(), PressureBefore.getOccupancy(DAG.MF));
unsigned Profit =
((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
@@ -1579,8 +1586,7 @@ bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
void GCNSchedStage::revertScheduling() {
DAG.RegionsWithMinOcc[RegionIdx] =
- PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) ==
- DAG.MinOccupancy;
+ PressureBefore.getOccupancy(DAG.MF) == DAG.MinOccupancy;
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
DAG.RegionEnd = DAG.RegionBegin;
int SkippedDebugInstr = 0;
@@ -2017,9 +2023,7 @@ void PreRARematStage::rematerialize() {
}
}
DAG.Pressure[I] = RP;
- AchievedOcc = std::min(
- AchievedOcc, RP.getOccupancy(ST, MF.getInfo<SIMachineFunctionInfo>()
- ->getDynamicVGPRBlockSize()));
+ AchievedOcc = std::min(AchievedOcc, RP.getOccupancy(DAG.MF));
}
REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
}
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index 6b13b06590102..e29ac72c7ba31 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -197,9 +197,7 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// pointer becomes dead and could otherwise be reused for destination.
RPT.advanceToNext();
GCNRegPressure MaxPressure = RPT.moveMaxPressure();
- unsigned Occupancy = MaxPressure.getOccupancy(
- *ST,
- MI.getMF()->getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
+ unsigned Occupancy = MaxPressure.getOccupancy(*MI.getMF());
// Don't push over half the register budget. We don't want to introduce
// spilling just to form a soft clause.
@@ -211,7 +209,10 @@ bool SIFormMemoryClausesImpl::checkPressure(const MachineInstr &MI,
// tracking does not account for the alignment requirements for SGPRs, or the
// fragmentation of registers the allocator will need to satisfy.
if (Occupancy >= MFI->getMinAllowedOccupancy() &&
- MaxPressure.getVGPRNum(ST->hasGFX90AInsts()) <= MaxVGPRs / 2 &&
+ MaxPressure.getVGPRNum(
+ ST->hasGFX90AInsts(),
+ ST->getMaxNumVectorRegs(MI.getMF()->getFunction()).first) <=
+ MaxVGPRs / 2 &&
MaxPressure.getSGPRNum() <= MaxSGPRs / 2) {
LastRecordedOccupancy = Occupancy;
return true;
diff --git a/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
new file mode 100644
index 0000000000000..a5183ce0d2661
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/avgpr-pressure.mir
@@ -0,0 +1,481 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler --debug-only=machine-scheduler -o - %s 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+--- |
+ define void @avgpr_rp_occ1() #0 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ2() #1 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ3() #2 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ4() #3 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ5() #4 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ6() #5 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ7() #6 {
+ entry:
+ unreachable
+ }
+
+ define void @avgpr_rp_occ8() #7 {
+ entry:
+ unreachable
+ }
+
+
+ define void @vgpr_rp_occ1() #0 {
+ entry:
+ unreachable
+ }
+
+ define void @vgpr_rp_occ2() #1 {
+ entry:
+ unreachable
+ }
+
+ define void @vgpr_rp_occ3() #2 {
+ entry:
+ unreachable
+ }
+
+ attributes #0 = {"amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #1 = {"amdgpu-waves-per-eu"="2,2" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #2 = {"amdgpu-waves-per-eu"="3,3" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #3 = {"amdgpu-waves-per-eu"="4,4" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #4 = {"amdgpu-waves-per-eu"="5,5" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #5 = {"amdgpu-waves-per-eu"="6,6" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #6 = {"amdgpu-waves-per-eu"="7,7" "amdgpu-flat-work-group-size"="64,64"}
+ attributes #7 = {"amdgpu-waves-per-eu"="8,8" "amdgpu-flat-work-group-size"="64,64"}
+
+
+...
+
+# CHECK: avgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 256 AGPRs: 192(O1), SGPRs: 0(O10), LVGPR WT: 256, LSGPR WT: 0 -> Occ: 1
+
+---
+name: avgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:vreg_1024 = IMPLICIT_DEF
+ %4:vreg_1024 = IMPLICIT_DEF
+ %5:vreg_1024 = IMPLICIT_DEF
+ %6:vreg_1024 = IMPLICIT_DEF
+ %7:vreg_1024 = IMPLICIT_DEF
+ %8:av_1024 = IMPLICIT_DEF
+ %9:av_1024 = IMPLICIT_DEF
+ %10:av_1024 = IMPLICIT_DEF
+ %11:av_1024 = IMPLICIT_DEF
+ %12:av_1024 = IMPLICIT_DEF
+ %13:av_1024 = IMPLICIT_DEF
+ %14:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2, %3, %4, %5, %6, %7
+
+ bb.1:
+ KILL %8, %9, %10, %11, %12, %13, %14
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 64(O2), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 2
+
+---
+name: avgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:vreg_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ %5:av_1024 = IMPLICIT_DEF
+ %6:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2, %3
+
+ bb.1:
+ KILL %4, %5, %6
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 84 AGPRs: 44(O4), SGPRs: 0(O10), LVGPR WT: 84, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ4:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 64 AGPRs: 64(O4), SGPRs: 0(O10), LVGPR WT: 64, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ4
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:av_1024 = IMPLICIT_DEF
+ %2:av_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ5:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 48 AGPRs: 80(O4), SGPRs: 0(O10), LVGPR WT: 80, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ5
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:av_1024 = IMPLICIT_DEF
+ %2:av_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ6:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 40 AGPRs: 88(O4), SGPRs: 0(O10), LVGPR WT: 88, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ6
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:av_1024 = IMPLICIT_DEF
+ %2:av_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ7:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 36 AGPRs: 92(O4), SGPRs: 0(O10), LVGPR WT: 92, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ7
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:av_1024 = IMPLICIT_DEF
+ %2:av_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: avgpr_rp_occ8:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 32 AGPRs: 96(O4), SGPRs: 0(O10), LVGPR WT: 96, LSGPR WT: 0 -> Occ: 4
+
+---
+name: avgpr_rp_occ8
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:av_1024 = IMPLICIT_DEF
+ %2:av_1024 = IMPLICIT_DEF
+ %3:av_1024 = IMPLICIT_DEF
+ %4:av_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ1:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 448 AGPRs: 0(O1), SGPRs: 0(O10), LVGPR WT: 448, LSGPR WT: 0 -> Occ: 1
+
+---
+name: vgpr_rp_occ1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:vreg_1024 = IMPLICIT_DEF
+ %4:vreg_1024 = IMPLICIT_DEF
+ %5:vreg_1024 = IMPLICIT_DEF
+ %6:vreg_1024 = IMPLICIT_DEF
+ %7:vreg_1024 = IMPLICIT_DEF
+ %8:vreg_1024 = IMPLICIT_DEF
+ %9:vreg_1024 = IMPLICIT_DEF
+ %10:vreg_1024 = IMPLICIT_DEF
+ %11:vreg_1024 = IMPLICIT_DEF
+ %12:vreg_1024 = IMPLICIT_DEF
+ %13:vreg_1024 = IMPLICIT_DEF
+ %14:vreg_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2, %3, %4, %5, %6, %7
+
+ bb.1:
+ KILL %8, %9, %10, %11, %12, %13, %14
+ S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ2:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 192 AGPRs: 0(O2), SGPRs: 0(O10), LVGPR WT: 192, LSGPR WT: 0 -> Occ: 2
+
+---
+name: vgpr_rp_occ2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:vreg_1024 = IMPLICIT_DEF
+ %4:vreg_1024 = IMPLICIT_DEF
+ %5:vreg_1024 = IMPLICIT_DEF
+ %6:vreg_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2, %3
+
+ bb.1:
+ KILL %4, %5, %6
+ S_ENDPGM 0
+...
+
+# CHECK: vgpr_rp_occ3:%bb.0
+# CHECK: Pressure before scheduling:
+# CHECK-NEXT: Region live-ins:
+# CHECK-NEXT: Region live-in pressure: VGPRs: 0 AGPRs: 0(O8), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 8
+# CHECK-NEXT: Region register pressure: VGPRs: 128 AGPRs: 0(O4), SGPRs: 0(O10), LVGPR WT: 128, LSGPR WT: 0 -> Occ: 4
+
+
+---
+name: vgpr_rp_occ3
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+ argumentInfo:
+ privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+ kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+ workGroupIDX: { reg: '$sgpr6' }
+ privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+ workItemIDX: { reg: '$vgpr0' }
+ sgprForEXECCopy: '$sgpr100_sgpr101'
+body: |
+ bb.0:
+ liveins: $vgpr0, $sgpr4_sgpr5
+ %1:vreg_1024 = IMPLICIT_DEF
+ %2:vreg_1024 = IMPLICIT_DEF
+ %3:vreg_1024 = IMPLICIT_DEF
+ %4:vreg_1024 = IMPLICIT_DEF
+ SCHED_BARRIER 0
+ KILL %1, %2
+
+ bb.1:
+ KILL %3, %4
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
index 2a08c52e447ba..72181346764fb 100644
--- a/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
+++ b/llvm/test/CodeGen/AMDGPU/debug-value-scheduler-liveins.mir
@@ -6,7 +6,7 @@
# CHECK-NEXT: test_get_liveins:%bb.0
# CHECK: ********** MI Scheduling **********
# CHECK-NEXT: test_get_liveins:%bb.1
-# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0, SGPRs: 0, LVGPR WT: 0, LSGPR WT: 0
+# CHECK: Region live-in pressure: VGPRs: 1 AGPRs: 0(O10), SGPRs: 0(O10), LVGPR WT: 0, LSGPR WT: 0 -> Occ: 10
# CHECK: ScheduleDAGMILive::schedule starting
---
>From 13f5604a8e6bbe1576457fea43ac40f4ce954f49 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 15 Jul 2025 15:10:41 -0700
Subject: [PATCH 2/5] [AMDGPU] Add scheduling stage to rewrite MFMA from VGPR
to AGPR
Change-Id: I47b2a4274a35f3cf0a6d064674d1d29526e4dfd2
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 283 ++++++++++++++++++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 35 ++-
2 files changed, 313 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 3cf9a7c0f972e..de4f3433c80b2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -29,6 +29,7 @@
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineCycleAnalysis.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
@@ -535,6 +536,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
const MachineSchedContext *C, bool IsLegacyScheduler)
: GCNSchedStrategy(C) {
SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+ SchedStages.push_back(GCNSchedStageID::RewriteSchedule);
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
@@ -785,6 +787,8 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
switch (SchedStageID) {
case GCNSchedStageID::OccInitialSchedule:
return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+ case GCNSchedStageID::RewriteSchedule:
+ return std::make_unique<RewriteScheduleStage>(SchedStageID, *this);
case GCNSchedStageID::UnclusteredHighRPReschedule:
return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
case GCNSchedStageID::ClusteredLowOccupancyReschedule:
@@ -948,10 +952,12 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
Pressure.resize(Regions.size());
RegionsWithHighRP.resize(Regions.size());
RegionsWithExcessRP.resize(Regions.size());
+ RegionsWithExcessArchVGPR.resize(Regions.size());
RegionsWithMinOcc.resize(Regions.size());
RegionsWithIGLPInstrs.resize(Regions.size());
RegionsWithHighRP.reset();
RegionsWithExcessRP.reset();
+ RegionsWithExcessArchVGPR.reset();
RegionsWithMinOcc.reset();
RegionsWithIGLPInstrs.reset();
@@ -1010,6 +1016,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
case GCNSchedStageID::OccInitialSchedule:
OS << "Max Occupancy Initial Schedule";
break;
+ case GCNSchedStageID::RewriteSchedule:
+ OS << "Instruction Rewriting Reschedule";
+ break;
case GCNSchedStageID::UnclusteredHighRPReschedule:
OS << "Unclustered High Register Pressure Reschedule";
break;
@@ -1043,6 +1052,245 @@ bool GCNSchedStage::initGCNSchedStage() {
return true;
}
+bool RewriteScheduleStage::initGCNSchedStage() {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasGFX90AInsts() || DAG.RegionsWithExcessArchVGPR.none())
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *SRI = ST.getRegisterInfo();
+ SmallPtrSet<MachineInstr *, 16> CrossRCUseCopies;
+ SmallPtrSet<MachineInstr *, 16> CrossRCDefCopies;
+ std::vector<std::pair<MachineInstr *, unsigned>> RewriteInsts;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (TII->isMAI(MI)) {
+ int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
+ if (ReplacementOp == -1)
+ continue;
+ const TargetRegisterClass *VGPRRC =
+ DAG.MRI.getRegClass(MI.getOperand(0).getReg());
+ const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
+ const TargetRegisterClass *DestConstrainExceptRC =
+ recomputeRegClassExceptRewritable(MI.getOperand(0).getReg(), VGPRRC,
+ AGPRRC);
+
+ if (!DestConstrainExceptRC)
+ CrossRCUseCopies.insert(&MI);
+
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+ if (Src2 && Src2->isReg()) {
+ const TargetRegisterClass *Src2ConstrainExceptRC =
+ recomputeRegClassExceptRewritable(Src2->getReg(), VGPRRC, AGPRRC);
+ if ((!Src2ConstrainExceptRC || Src2ConstrainExceptRC != AGPRRC))
+ CrossRCDefCopies.insert(&MI);
+
+ DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
+ }
+
+ DAG.MRI.setRegClass(MI.getOperand(0).getReg(), AGPRRC);
+
+ auto OriginalOpc = MI.getOpcode();
+ MI.setDesc(TII->get(ReplacementOp));
+ RewriteInsts.push_back({&MI, OriginalOpc});
+ }
+ }
+ }
+
+ bool ShouldRewrite = false;
+ for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
+ if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
+ continue;
+
+ // For the cases we care about (i.e. ArchVGPR usage is greater than the
+ // addressable limit), rewriting alone should bring pressure to manageable
+ // level. If we find any such region, then the rewrite is potentially
+ // beneficial.
+ auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
+ unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+ if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
+ PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
+ ShouldRewrite = true;
+ break;
+ }
+ }
+
+  // If we find that we would need to insert cross-RC copies inside loop
+  // bodies, then bail out of the rewrite.
+ if (ShouldRewrite) {
+ CI.clear();
+ CI.compute(MF);
+
+ for (auto *DefMI : CrossRCUseCopies) {
+ auto DefReg = DefMI->getOperand(0).getReg();
+
+ for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg)) {
+ for (unsigned OpNo = 0; OpNo < UseMI.getNumOperands(); OpNo++) {
+ auto &TheOp = UseMI.getOperand(OpNo);
+ if (!TheOp.isReg() || !TheOp.isUse())
+ continue;
+ if (TheOp.getReg() != DefReg)
+ continue;
+
+ auto RequiredRC = UseMI.getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+ if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+ continue;
+
+ unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
+ if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
+ ShouldRewrite = false;
+ break;
+ }
+ }
+ if (!ShouldRewrite)
+ break;
+ }
+ if (!ShouldRewrite)
+ break;
+ }
+ }
+
+  // If we haven't found the beneficial conditions, prefer the VGPR form,
+  // which may result in fewer cross-RC copies.
+ if (!ShouldRewrite) {
+ for (auto RI : RewriteInsts) {
+ MachineInstr *MI = RI.first;
+
+ assert(TII->isMAI(*MI));
+ const TargetRegisterClass *AGPRRC =
+ DAG.MRI.getRegClass(MI->getOperand(0).getReg());
+ const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
+
+ MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+ assert(Src2);
+
+ if (Src2->isReg()) {
+ DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
+ }
+ DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
+ MI->setDesc(TII->get(RI.second));
+ }
+
+ return false;
+ }
+
+ DAG.RegionsWithExcessArchVGPR.reset();
+ DAG.RegionsWithExcessRP.reset();
+
+ // Insert cross RC copies for the users of the MFMA result
+ for (auto MI : CrossRCUseCopies) {
+ auto DefReg = MI->getOperand(0).getReg();
+ SmallVector<MachineInstr *, 4> UseInstrs;
+ for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
+ UseInstrs.push_back(&UseMI);
+
+ DenseMap<Register, MachineInstr *> NewCopies;
+ for (auto UseMI : UseInstrs) {
+ for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
+ auto &TheOp = UseMI->getOperand(OpNo);
+ if (!TheOp.isReg() || !TheOp.isUse())
+ continue;
+ if (TheOp.getReg() != DefReg)
+ continue;
+
+ auto RequiredRC = UseMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+ if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+ continue;
+
+ Register DestVGPR;
+ if (!NewCopies.contains(DefReg)) {
+ Register DestVGPR = DAG.MRI.createVirtualRegister(
+ SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
+
+ // Insert copy near the user to avoid inserting inside loops.
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+ UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(DestVGPR, 0, 0)
+ .addUse(DefReg, 0, 0);
+
+ NewCopies[DefReg] = VGPRCopy;
+ }
+ DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+ TheOp.setReg(DestVGPR);
+ }
+ }
+ if (NewCopies.contains(DefReg)) {
+ DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
+ DAG.LIS->removeInterval(DefReg);
+ DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+ DAG.LIS->createAndComputeVirtRegInterval(
+ NewCopies[DefReg]->getOperand(0).getReg());
+ }
+ }
+
+ // Insert cross RC copies for the use operands of the MFMA
+ for (auto MI : CrossRCDefCopies) {
+ MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
+ if (!Src2)
+ continue;
+ if (!Src2->isReg())
+ continue;
+ auto Src2Reg = Src2->getReg();
+ SmallVector<MachineInstr *, 4> DefInstrs;
+ for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
+ DefInstrs.push_back(&DefMI);
+
+ DenseMap<Register, MachineInstr *> NewCopies;
+ for (auto DefMI : DefInstrs) {
+ for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
+ auto &TheOp = DefMI->getOperand(OpNo);
+ if (!TheOp.isReg() || !TheOp.isDef())
+ continue;
+ if (TheOp.getReg() != Src2Reg)
+ continue;
+
+ auto RequiredRC = DefMI->getRegClassConstraint(OpNo, DAG.TII, DAG.TRI);
+
+ if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
+ continue;
+
+ Register SrcVGPR;
+ if (!NewCopies.contains(Src2Reg)) {
+ Register SrcVGPR = DAG.MRI.createVirtualRegister(
+ SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
+
+ // Insert copy near the def to avoid inserting inside loops.
+ MachineInstrBuilder VGPRCopy =
+ BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+ DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
+ .addDef(Src2Reg, 0, 0)
+ .addUse(SrcVGPR, 0, 0);
+
+ NewCopies[Src2Reg] = VGPRCopy;
+ }
+
+ SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+ TheOp.setReg(SrcVGPR);
+ }
+ }
+
+ if (NewCopies.contains(Src2Reg)) {
+ DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
+ DAG.LIS->removeInterval(Src2Reg);
+ DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+ DAG.LIS->createAndComputeVirtRegInterval(
+ NewCopies[Src2Reg]->getOperand(1).getReg());
+ }
+ }
+
+  // Live-ins may have been modified by the inserted cross-RC copies.
+ RegionPressureMap LiveInUpdater(&DAG, false);
+ LiveInUpdater.buildLiveRegMap();
+
+ for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++)
+ DAG.LiveIns[RegionIdx] = LiveInUpdater.getLiveRegsForRegionIdx(RegionIdx);
+
+ return true;
+}
+
bool UnclusteredHighRPStage::initGCNSchedStage() {
if (DisableUnclusterHighRP)
return false;
@@ -1348,6 +1596,9 @@ void GCNSchedStage::checkScheduling() {
DAG.RegionsWithExcessRP[RegionIdx] = true;
}
+ if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+ DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
+
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
if (shouldRevertScheduling(WavesAfter)) {
@@ -1648,6 +1899,38 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
+bool RewriteScheduleStage::isRewriteCandidate(MachineInstr *MI) const {
+
+ if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
+ return false;
+ return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
+}
+
+const TargetRegisterClass *
+RewriteScheduleStage::recomputeRegClassExceptRewritable(
+ Register Reg, const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *NewRC) const {
+
+ // Accumulate constraints from all uses.
+ for (MachineOperand &MO : DAG.MRI.reg_nodbg_operands(Reg)) {
+ // Apply the effect of the given operand to NewRC.
+ MachineInstr *MI = MO.getParent();
+ // We can swap the classes of dst + src2 as a pair to AGPR, so ignore the
+ // effects of rewrite candidates. It just so happens that we can use either
+ // AGPR or VGPR in src0/src1, so don't bother checking the constraint
+ // effects of the individual operands.
+ if (isRewriteCandidate(MI))
+ continue;
+
+ unsigned OpNo = &MO - &MI->getOperand(0);
+ NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, DAG.TII, DAG.TRI);
+ if (!NewRC || NewRC == OldRC)
+ return nullptr;
+ }
+
+ return NewRC;
+}
+
bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
SlotIndex OriginalIdx,
SlotIndex RematIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 94cd795bbc8f6..af264d062ce5a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -28,11 +28,12 @@ class GCNSchedStage;
enum class GCNSchedStageID : unsigned {
OccInitialSchedule = 0,
- UnclusteredHighRPReschedule = 1,
- ClusteredLowOccupancyReschedule = 2,
- PreRARematerialize = 3,
- ILPInitialSchedule = 4,
- MemoryClauseInitialSchedule = 5
+ RewriteSchedule = 1,
+ UnclusteredHighRPReschedule = 2,
+ ClusteredLowOccupancyReschedule = 3,
+ PreRARematerialize = 4,
+ ILPInitialSchedule = 5,
+ MemoryClauseInitialSchedule = 6
};
#ifndef NDEBUG
@@ -224,6 +225,7 @@ using RegionBoundaries =
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
friend class GCNSchedStage;
friend class OccInitialScheduleStage;
+ friend class RewriteScheduleStage;
friend class UnclusteredHighRPStage;
friend class ClusteredLowOccStage;
friend class PreRARematStage;
@@ -250,6 +252,11 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// limit. Register pressure in these regions usually will result in spilling.
BitVector RegionsWithExcessRP;
+  // Record regions whose ArchVGPR pressure exceeds the addressable ArchVGPR
+  // limit. Register pressure in these regions will usually result in
+  // spilling.
+ BitVector RegionsWithExcessArchVGPR;
+
// Regions that has the same occupancy as the latest MinOccupancy
BitVector RegionsWithMinOcc;
@@ -401,6 +408,24 @@ class OccInitialScheduleStage : public GCNSchedStage {
: GCNSchedStage(StageID, DAG) {}
};
+class RewriteScheduleStage : public GCNSchedStage {
+private:
+ const TargetRegisterClass *
+ recomputeRegClassExceptRewritable(Register Reg,
+ const TargetRegisterClass *OldRC,
+ const TargetRegisterClass *NewRC) const;
+
+ bool isRewriteCandidate(MachineInstr *MI) const;
+
+ MachineCycleInfo CI;
+
+public:
+ bool initGCNSchedStage() override;
+
+ RewriteScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+ : GCNSchedStage(StageID, DAG) {}
+};
+
class UnclusteredHighRPStage : public GCNSchedStage {
private:
// Save the initial occupancy before starting this stage.
>From 678aa740ab750b44a950630db3883b3d50282fd0 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 09:39:49 -0700
Subject: [PATCH 3/5] Rebase + block frequencies
Change-Id: I3ca5a96c10fad06bb9c66d6b4e36fbe48157070a
---
llvm/lib/Target/AMDGPU/GCNRegPressure.h | 1 -
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 85 +++++++++++++++++----
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 6 +-
3 files changed, 76 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 8b80cc42c9bb0..81df54126cdfe 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -99,7 +99,6 @@ struct GCNRegPressure {
unsigned getArchVGPRNum(unsigned AddressableArchVGPR) const {
unsigned AVGPRsAsVGPRs =
getAVGPRsAsVGPRsNum(Value[VGPR], Value[AVGPR], AddressableArchVGPR);
-
return Value[VGPR] + AVGPRsAsVGPRs;
}
/// \returns the AccVGPR32 pressure
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index de4f3433c80b2..154024157056b 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1098,29 +1098,80 @@ bool RewriteScheduleStage::initGCNSchedStage() {
}
}
- bool ShouldRewrite = false;
+ unsigned ArchVGPRThreshold =
+ ST.getMaxNumVectorRegs(DAG.MF.getFunction()).first;
+
+ int64_t Cost = 0;
+ MBFI.calculate(MF, MBPI, *DAG.MLI);
for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
continue;
+ unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
+
+ auto PressureBefore = DAG.Pressure[RegionIdx];
+ unsigned UnifiedPressureBefore =
+ PressureBefore.getVGPRNum(true, ArchVGPRThreshold);
+ unsigned ArchPressureBefore =
+ PressureBefore.getArchVGPRNum(ArchVGPRThreshold);
+ unsigned AGPRPressureBefore = PressureBefore.getAGPRNum(ArchVGPRThreshold);
+ unsigned UnifiedSpillBefore =
+ UnifiedPressureBefore > MaxCombinedVGPRs
+ ? (UnifiedPressureBefore - MaxCombinedVGPRs)
+ : 0;
+ unsigned ArchSpillBefore =
+ ArchPressureBefore > ST.getAddressableNumArchVGPRs()
+ ? (ArchPressureBefore - ST.getAddressableNumArchVGPRs())
+ : 0;
+ unsigned AGPRSpillBefore =
+ AGPRPressureBefore > ST.getAddressableNumArchVGPRs()
+ ? (AGPRPressureBefore - ST.getAddressableNumArchVGPRs())
+ : 0;
+
+ unsigned SpillCostBefore =
+ std::max(UnifiedSpillBefore, (ArchSpillBefore + AGPRSpillBefore));
+
+
// For the cases we care about (i.e. ArchVGPR usage is greater than the
// addressable limit), rewriting alone should bring pressure to manageable
// level. If we find any such region, then the rewrite is potentially
// beneficial.
auto PressureAfter = DAG.getRealRegPressure(RegionIdx);
- unsigned MaxCombinedVGPRs = ST.getMaxNumVGPRs(MF);
- if (PressureAfter.getArchVGPRNum() <= ST.getAddressableNumArchVGPRs() &&
- PressureAfter.getVGPRNum(true) <= MaxCombinedVGPRs) {
- ShouldRewrite = true;
- break;
- }
+ unsigned UnifiedPressureAfter =
+ PressureAfter.getVGPRNum(true, ArchVGPRThreshold);
+ unsigned ArchPressureAfter =
+ PressureAfter.getArchVGPRNum(ArchVGPRThreshold);
+ unsigned AGPRPressureAfter = PressureAfter.getAGPRNum(ArchVGPRThreshold);
+ unsigned UnifiedSpillAfter = UnifiedPressureAfter > MaxCombinedVGPRs
+ ? (UnifiedPressureAfter - MaxCombinedVGPRs)
+ : 0;
+ unsigned ArchSpillAfter =
+ ArchPressureAfter > ST.getAddressableNumArchVGPRs()
+ ? (ArchPressureAfter - ST.getAddressableNumArchVGPRs())
+ : 0;
+ unsigned AGPRSpillAfter =
+ AGPRPressureAfter > ST.getAddressableNumArchVGPRs()
+ ? (AGPRPressureAfter - ST.getAddressableNumArchVGPRs())
+ : 0;
+
+ unsigned SpillCostAfter =
+ std::max(UnifiedSpillAfter, (ArchSpillAfter + AGPRSpillAfter));
+
+ uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
+ uint64_t BlockFreq =
+ EntryFreq ? MBFI.getBlockFreq(DAG.Regions[RegionIdx].first->getParent())
+ .getFrequency() / EntryFreq
+ : 1;
+
+ // Assumes perfect spilling -- giving edge to VGPR form.
+ Cost += ((int)SpillCostAfter - (int)SpillCostBefore) * (int)BlockFreq;
}
// If we find that we'll need to insert cross RC copies inside loop bodies,
// then bail
+ bool ShouldRewrite = Cost < 0;
if (ShouldRewrite) {
- CI.clear();
- CI.compute(MF);
+ uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
for (auto *DefMI : CrossRCUseCopies) {
auto DefReg = DefMI->getOperand(0).getReg();
@@ -1137,11 +1188,16 @@ bool RewriteScheduleStage::initGCNSchedStage() {
if (!RequiredRC || SRI->hasAGPRs(RequiredRC))
continue;
- unsigned DefDepth = CI.getCycleDepth(DefMI->getParent());
- if (DefDepth && CI.getCycleDepth(UseMI.getParent()) >= DefDepth) {
- ShouldRewrite = false;
+ uint64_t UseFreq =
+ EntryFreq ? MBFI.getBlockFreq(UseMI.getParent()).getFrequency() /
+ EntryFreq
+ : 1;
+
+ // Assumes no copy-reuse, giving edge to VGPR form.
+ Cost += UseFreq;
+ ShouldRewrite = Cost < 0;
+ if (!ShouldRewrite)
break;
- }
}
if (!ShouldRewrite)
break;
@@ -1596,7 +1652,8 @@ void GCNSchedStage::checkScheduling() {
DAG.RegionsWithExcessRP[RegionIdx] = true;
}
- if (PressureAfter.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
+ if (PressureAfter.getArchVGPRNum(ArchVGPRThreshold) >
+ ST.getAddressableNumArchVGPRs())
DAG.RegionsWithExcessArchVGPR[RegionIdx] = true;
// Revert if this region's schedule would cause a drop in occupancy or
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index af264d062ce5a..786b5264bcec2 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -16,6 +16,9 @@
#include "GCNRegPressure.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -417,7 +420,8 @@ class RewriteScheduleStage : public GCNSchedStage {
bool isRewriteCandidate(MachineInstr *MI) const;
- MachineCycleInfo CI;
+ MachineBranchProbabilityInfo MBPI;
+ MachineBlockFrequencyInfo MBFI;
public:
bool initGCNSchedStage() override;
>From 639642e6e809729a23006312c2fe1f7ad389b214 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 15:06:32 -0700
Subject: [PATCH 4/5] Track copies by MBB
Change-Id: Icd3934749a199e29660bfb0d187f045b18f1bd7d
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 45 ++++++++++++---------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 154024157056b..24e5383fc5963 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1241,7 +1241,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
for (auto &UseMI : DAG.MRI.use_nodbg_instructions(DefReg))
UseInstrs.push_back(&UseMI);
- DenseMap<Register, MachineInstr *> NewCopies;
+ DenseMap<Register, DenseMap<MachineBasicBlock *, MachineInstr *>> NewCopies;
for (auto UseMI : UseInstrs) {
for (unsigned OpNo = 0; OpNo < UseMI->getNumOperands(); OpNo++) {
auto &TheOp = UseMI->getOperand(OpNo);
@@ -1256,29 +1256,32 @@ bool RewriteScheduleStage::initGCNSchedStage() {
continue;
Register DestVGPR;
- if (!NewCopies.contains(DefReg)) {
+ if (!NewCopies.contains(DefReg) || !NewCopies[DefReg].contains(UseMI->getParent())) {
Register DestVGPR = DAG.MRI.createVirtualRegister(
SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
// Insert copy near the user to avoid inserting inside loops.
+      // TODO: choose a better insertion point for the cross-RC copy.
MachineInstrBuilder VGPRCopy =
- BuildMI(*UseMI->getParent(), UseMI->getIterator(),
+ BuildMI(*UseMI->getParent(), UseMI->getParent()->getFirstNonPHI(),
UseMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(DestVGPR, 0, 0)
.addUse(DefReg, 0, 0);
- NewCopies[DefReg] = VGPRCopy;
+ NewCopies[DefReg][UseMI->getParent()] = VGPRCopy;
}
- DestVGPR = NewCopies[DefReg]->getOperand(0).getReg();
+ DestVGPR = NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
TheOp.setReg(DestVGPR);
}
}
if (NewCopies.contains(DefReg)) {
- DAG.LIS->InsertMachineInstrInMaps(*NewCopies[DefReg]);
- DAG.LIS->removeInterval(DefReg);
- DAG.LIS->createAndComputeVirtRegInterval(DefReg);
- DAG.LIS->createAndComputeVirtRegInterval(
- NewCopies[DefReg]->getOperand(0).getReg());
+ for (auto NewCopy : NewCopies[DefReg]) {
+ DAG.LIS->InsertMachineInstrInMaps(*NewCopy.second);
+ DAG.LIS->removeInterval(DefReg);
+ DAG.LIS->createAndComputeVirtRegInterval(DefReg);
+ DAG.LIS->createAndComputeVirtRegInterval(
+ NewCopy.second->getOperand(0).getReg());
+ }
}
}
@@ -1294,7 +1297,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
for (auto &DefMI : DAG.MRI.def_instructions(Src2Reg))
DefInstrs.push_back(&DefMI);
- DenseMap<Register, MachineInstr *> NewCopies;
+ DenseMap<Register, DenseMap<MachineBasicBlock *, MachineInstr *>> NewCopies;
for (auto DefMI : DefInstrs) {
for (unsigned OpNo = 0; OpNo < DefMI->getNumOperands(); OpNo++) {
auto &TheOp = DefMI->getOperand(OpNo);
@@ -1309,31 +1312,33 @@ bool RewriteScheduleStage::initGCNSchedStage() {
continue;
Register SrcVGPR;
- if (!NewCopies.contains(Src2Reg)) {
+ if (!NewCopies.contains(Src2Reg) || !NewCopies[Src2Reg].contains(DefMI->getParent())) {
Register SrcVGPR = DAG.MRI.createVirtualRegister(
SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
// Insert copy near the def to avoid inserting inside loops.
MachineInstrBuilder VGPRCopy =
- BuildMI(*DefMI->getParent(), ++DefMI->getIterator(),
+ BuildMI(*DefMI->getParent(), DefMI->getParent()->end(),
DefMI->getDebugLoc(), TII->get(TargetOpcode::COPY))
.addDef(Src2Reg, 0, 0)
.addUse(SrcVGPR, 0, 0);
- NewCopies[Src2Reg] = VGPRCopy;
+ NewCopies[Src2Reg][DefMI->getParent()] = VGPRCopy;
}
- SrcVGPR = NewCopies[Src2Reg]->getOperand(1).getReg();
+ SrcVGPR = NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
TheOp.setReg(SrcVGPR);
}
}
if (NewCopies.contains(Src2Reg)) {
- DAG.LIS->InsertMachineInstrInMaps(*NewCopies[Src2Reg]);
- DAG.LIS->removeInterval(Src2Reg);
- DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
- DAG.LIS->createAndComputeVirtRegInterval(
- NewCopies[Src2Reg]->getOperand(1).getReg());
+ for (auto NewCopy : NewCopies[Src2Reg]) {
+ DAG.LIS->InsertMachineInstrInMaps(*NewCopy.second);
+ DAG.LIS->removeInterval(Src2Reg);
+ DAG.LIS->createAndComputeVirtRegInterval(Src2Reg);
+ DAG.LIS->createAndComputeVirtRegInterval(
+ NewCopy.second->getOperand(1).getReg());
+ }
}
}
>From c62a2f127cba5d6df350474dfd4a6e5f9250fe4f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 29 Jul 2025 15:47:12 -0700
Subject: [PATCH 5/5] Formatting
Change-Id: Iab30e9e090220757e79f0c6b2c898465c1262abf
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 24e5383fc5963..f6b9931ddaef5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1102,7 +1102,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
ST.getMaxNumVectorRegs(DAG.MF.getFunction()).first;
int64_t Cost = 0;
- MBFI.calculate(MF, MBPI, *DAG.MLI);
+ MBFI.calculate(MF, MBPI, *DAG.MLI);
for (unsigned RegionIdx = 0; RegionIdx < DAG.Regions.size(); RegionIdx++) {
if (!DAG.RegionsWithExcessArchVGPR[RegionIdx])
continue;
@@ -1131,7 +1131,6 @@ bool RewriteScheduleStage::initGCNSchedStage() {
unsigned SpillCostBefore =
std::max(UnifiedSpillBefore, (ArchSpillBefore + AGPRSpillBefore));
-
// For the cases we care about (i.e. ArchVGPR usage is greater than the
// addressable limit), rewriting alone should bring pressure to manageable
// level. If we find any such region, then the rewrite is potentially
@@ -1160,7 +1159,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
uint64_t EntryFreq = MBFI.getEntryFreq().getFrequency();
uint64_t BlockFreq =
EntryFreq ? MBFI.getBlockFreq(DAG.Regions[RegionIdx].first->getParent())
- .getFrequency() / EntryFreq
+ .getFrequency() /
+ EntryFreq
: 1;
// Assumes perfect spilling -- giving edge to VGPR form.
@@ -1256,7 +1256,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
continue;
Register DestVGPR;
- if (!NewCopies.contains(DefReg) || !NewCopies[DefReg].contains(UseMI->getParent())) {
+ if (!NewCopies.contains(DefReg) ||
+ !NewCopies[DefReg].contains(UseMI->getParent())) {
Register DestVGPR = DAG.MRI.createVirtualRegister(
SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(DefReg)));
@@ -1270,7 +1271,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
NewCopies[DefReg][UseMI->getParent()] = VGPRCopy;
}
- DestVGPR = NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
+ DestVGPR =
+ NewCopies[DefReg][UseMI->getParent()]->getOperand(0).getReg();
TheOp.setReg(DestVGPR);
}
}
@@ -1280,7 +1282,7 @@ bool RewriteScheduleStage::initGCNSchedStage() {
DAG.LIS->removeInterval(DefReg);
DAG.LIS->createAndComputeVirtRegInterval(DefReg);
DAG.LIS->createAndComputeVirtRegInterval(
- NewCopy.second->getOperand(0).getReg());
+ NewCopy.second->getOperand(0).getReg());
}
}
}
@@ -1312,7 +1314,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
continue;
Register SrcVGPR;
- if (!NewCopies.contains(Src2Reg) || !NewCopies[Src2Reg].contains(DefMI->getParent())) {
+ if (!NewCopies.contains(Src2Reg) ||
+ !NewCopies[Src2Reg].contains(DefMI->getParent())) {
Register SrcVGPR = DAG.MRI.createVirtualRegister(
SRI->getEquivalentVGPRClass(DAG.MRI.getRegClass(Src2Reg)));
@@ -1326,7 +1329,8 @@ bool RewriteScheduleStage::initGCNSchedStage() {
NewCopies[Src2Reg][DefMI->getParent()] = VGPRCopy;
}
- SrcVGPR = NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
+ SrcVGPR =
+ NewCopies[Src2Reg][DefMI->getParent()]->getOperand(1).getReg();
TheOp.setReg(SrcVGPR);
}
}
More information about the llvm-commits
mailing list