[llvm] AMDGPU: Track AGPR pressure (PR #150288)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 11:13:09 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Nicholas Baron (Nicholas-Baron)
<details>
<summary>Changes</summary>
---
The patch is 12.94 MiB and has been truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/150288.diff
157 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp (+38-8)
- (modified) llvm/lib/Target/AMDGPU/GCNSchedStrategy.h (+7-3)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+4-8)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+16)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll (+183-183)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll (+246-255)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+235-235)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+200-200)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll (+21-28)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (+73-73)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll (+175-175)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+116-116)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+165-162)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll (+409-409)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+306-306)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll (+407-407)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+162-162)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+424-422)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+147-147)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/abs_i16.ll (+216-216)
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast.ll (+238-230)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-csr.ll (+180-180)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+21585-20848)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+743-682)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll (+51-55)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll (+312-302)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+4314-4198)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll (+472-456)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+2310-2292)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll (+549-520)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll (+1200-1211)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll (+1133-1044)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+7000-6991)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+2408-2378)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+1086-1061)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+3542-3497)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+1677-1634)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+1308-1272)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+892-892)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+985-984)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+15-13)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-no-agprs-violations.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+6751-6672)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll (+63-63)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+167-167)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll (+13-6)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+135-135)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+846-846)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll (+37-36)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll (+191-190)
- (modified) llvm/test/CodeGen/AMDGPU/fceil64.ll (+300-301)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+112-112)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+99-99)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+334-334)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+688-715)
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (+216-216)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+372-393)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+242-281)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+38-40)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+163-159)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll (+54-56)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+292-293)
- (modified) llvm/test/CodeGen/AMDGPU/idot8s.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/idot8u.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+868-866)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll (+201-220)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll (+167-167)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+196-197)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+96-96)
- (modified) llvm/test/CodeGen/AMDGPU/live-interval-bug-in-rename-independent-subregs.mir (+38-34)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll (+77-67)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll (+45-60)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll (+486-627)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll (+326-304)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll (+36-72)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+91-91)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+124-124)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll (+111-111)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+170-169)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+72-72)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll (+111-111)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+170-169)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+1429-1281)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+589-592)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+314-440)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+623-626)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-f32.ll (+14-15)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+1619-1613)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+818-756)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+1685-1678)
- (modified) llvm/test/CodeGen/AMDGPU/load-local-i16.ll (+2237-2212)
- (modified) llvm/test/CodeGen/AMDGPU/machine-scheduler-sink-trivial-remats-attr.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+1163-1153)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.ll (+185-185)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+5231-5215)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll (+169-220)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll (+160-153)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+1163-1153)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.ll (+185-185)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+82-82)
- (modified) llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/no-remat-indirect-mov.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+354-333)
- (modified) llvm/test/CodeGen/AMDGPU/pr51516.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+91-89)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+75-75)
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f64.ll (+315-313)
- (modified) llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/schedule-barrier.mir (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/schedule-relaxed-occupancy.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/scratch-simple.ll (+1687-1676)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+14-13)
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+258-256)
- (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/shl.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v4i64.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+317-317)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2p0.v4p0.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3i64.v4i64.ll (+86-84)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v3p0.v4p0.ll (+86-84)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i64.v4i64.ll (+172-176)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p0.v4p0.ll (+172-176)
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/spill-agpr.ll (+69-72)
- (modified) llvm/test/CodeGen/AMDGPU/sra.ll (+31-31)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+193-193)
- (modified) llvm/test/CodeGen/AMDGPU/srl.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/ssubsat.ll (+116-116)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+5-10)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+62-62)
- (modified) llvm/test/CodeGen/AMDGPU/uaddsat.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/udiv.ll (+115-115)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll (+334-340)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+58-58)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 254b75b784e75..fab44ef942aa3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -86,6 +86,8 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+ AGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::AGPR_32RegClass);
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
// Set the initial TargetOccupnacy to the maximum occupancy that we can
@@ -98,6 +100,9 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
SGPRCriticalLimit =
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
+ AGPRCriticalLimit =
+ std::min(ST.getMaxNumAGPRs(TargetOccupancy), AGPRExcessLimit);
+
if (!KnownExcessRP) {
VGPRCriticalLimit = std::min(
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
@@ -201,7 +206,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
- unsigned VGPRPressure, bool IsBottomUp) {
+ unsigned VGPRPressure,
+ unsigned AGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
@@ -230,6 +236,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
Pressure.resize(4, 0);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = AGPRPressure;
for (const auto &Diff : DAG->getPressureDiff(SU)) {
if (!Diff.isValid())
@@ -247,7 +254,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
- CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
+ CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] ||
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32]) {
errs() << "Register Pressure is inaccurate when calculated through "
"PressureDiff\n"
<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
@@ -255,7 +264,10 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
<< ", expected "
- << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
+ << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"
+ << "AGPR got " << Pressure[AMDGPU::RegisterPressureSets::AGPR_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32] << "\n";
report_fatal_error("inaccurate register pressure calculation");
}
#endif
@@ -263,6 +275,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ unsigned NewAGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -272,9 +285,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
- const unsigned MaxVGPRPressureInc = 16;
+ static constexpr unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
- bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+ bool ShouldTrackAGPRs = !ShouldTrackVGPRs && AGPRPressure >= AGPRExcessLimit;
+ bool ShouldTrackSGPRs =
+ !ShouldTrackVGPRs && !ShouldTrackAGPRs && SGPRPressure >= SGPRExcessLimit;
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
@@ -291,6 +306,12 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
+ if (ShouldTrackAGPRs && NewAGPRPressure >= AGPRPressure) {
+ HasHighPressure = true;
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::AGPR_32);
+ Cand.RPDelta.Excess.setUnitInc(NewAGPRPressure - AGPRExcessLimit);
+ }
+
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -304,13 +325,19 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+ int AGPRDelta = NewAGPRPressure - AGPRCriticalLimit;
- if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ if (SGPRDelta >= 0 || VGPRDelta >= 0 || AGPRDelta >= 0) {
HasHighPressure = true;
- if (SGPRDelta > VGPRDelta) {
+ // Prioritize reducing the VGPRDelta if both are >= 0
+ if (SGPRDelta > VGPRDelta && SGPRDelta > AGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+ } else if (AGPRDelta > VGPRDelta) {
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::AGPR_32);
+ Cand.RPDelta.CriticalMax.setUnitInc(AGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
@@ -330,16 +357,19 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ unsigned AGPRPressure = 0;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ AGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32];
} else {
GCNRPTracker *T = IsBottomUp
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
VGPRPressure = T->getPressure().getArchVGPRNum();
+ AGPRPressure = T->getPressure().getAGPRNum();
}
}
ReadyQueue &Q = Zone.Available;
@@ -347,7 +377,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
- VGPRPressure, IsBottomUp);
+ VGPRPressure, AGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 790370ff8ab4d..8b2137bcd14da 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -53,7 +53,8 @@ class GCNSchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
- unsigned VGPRPressure, bool IsBottomUp);
+ unsigned VGPRPressure, unsigned AGPRPressure,
+ bool IsBottomUp);
std::vector<unsigned> Pressure;
@@ -63,6 +64,8 @@ class GCNSchedStrategy : public GenericScheduler {
unsigned VGPRExcessLimit;
+ unsigned AGPRExcessLimit;
+
unsigned TargetOccupancy;
MachineFunction *MF;
@@ -103,6 +106,8 @@ class GCNSchedStrategy : public GenericScheduler {
unsigned VGPRCriticalLimit;
+ unsigned AGPRCriticalLimit;
+
unsigned SGPRLimitBias = 0;
unsigned VGPRLimitBias = 0;
@@ -183,8 +188,7 @@ class ScheduleMetrics {
};
inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
- dbgs() << "\n Schedule Metric (scaled by "
- << ScheduleMetrics::ScaleFactor
+ dbgs() << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
<< " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
<< Sm.getLength() << " ]\n";
return OS;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4475c8d1d1602..c9fa3894408e9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1722,8 +1722,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
- unsigned getMaxNumAGPRs(const Function &F) const {
- return getMaxNumVGPRs(F);
+ unsigned getMaxNumAGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMaxNumAGPRs(this, WavesPerEU);
}
/// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
@@ -1744,13 +1744,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool supportsWave64() const { return !hasGFX1250Insts(); }
- bool isWave32() const {
- return getWavefrontSize() == 32;
- }
+ bool isWave32() const { return getWavefrontSize() == 32; }
- bool isWave64() const {
- return getWavefrontSize() == 64;
- }
+ bool isWave64() const { return getWavefrontSize() == 64; }
/// Returns if the wavesize of this subtarget is known reliable. This is false
/// only for the a default target-cpu that does not have an explicit
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0a0b02c18c1db..d78106694f2e8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1494,6 +1494,22 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
+unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned int WavesPerEU) {
+ if (!STI->getFeatureBits().test(FeatureMAIInsts))
+ return 0;
+
+ assert(WavesPerEU != 0);
+
+ assert(!STI->getFeatureBits().test(FeatureDynamicVGPR));
+
+ unsigned MaxNumAGPRs =
+ alignTo(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI, 0));
+ unsigned AddressableNumAGPRs = getAddressableNumArchVGPRs(STI);
+ return std::min(MaxNumAGPRs, AddressableNumAGPRs);
+}
+
+unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI) { return 256; }
+
unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks(
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 23ea3ba0c8385..ecf7faac89ce5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -353,6 +353,13 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
unsigned MaxWaves,
unsigned TotalNumVGPRs);
+/// \returns Maximum number of AGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+
+/// \returns Addressable number of AGPRs for a given subtarget \p STI.
+unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI);
+
/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p
/// Gen.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..7c58791281562 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -149,55 +149,55 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-LABEL: add_v5i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0
+; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0
+; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 2, v2
+; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 4, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 8, v2
+; GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17]
+; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 6, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v12, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v13, v[6:7]
-; GFX8-NEXT: flat_load_ushort v14, v[8:9]
-; GFX8-NEXT: flat_load_ushort v15, v[10:11]
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v17, v[2:3]
-; GFX8-NEXT: flat_load_ushort v18, v[0:1]
-; GFX8-NEXT: flat_load_ushort v19, v[6:7]
-; GFX8-NEXT: flat_load_ushort v20, v[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7]
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: flat_load_ushort v20, v[19:20]
+; GFX8-NEXT: flat_load_ushort v22, v[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v13, s[6:7], 0, v3, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v15, s[6:7], 0, v3, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v3, s[14:15]
+; GFX8-NEXT: flat_load_ushort v23, v[8:9]
; GFX8-NEXT: flat_load_ushort v10, v[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v3, s[12:13]
+; GFX8-NEXT: flat_load_ushort v11, v[2:3]
+; GFX8-NEXT: flat_load_ushort v12, v[12:13]
+; GFX8-NEXT: flat_load_ushort v13, v[14:15]
+; GFX8-NEXT: flat_load_ushort v14, v[16:17]
+; GFX8-NEXT: flat_load_ushort v15, v[18:19]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 4, v4
+; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 6, v4
+; GFX8-NEXT: v_add_u32_e64 v2, s[8:9], 8, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GFX8-NEXT: v_addc_u32_e64 v7, vcc, 0, v5, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v5, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v3, vcc, 0, v5, s[8:9]
; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u16_e32 v11, v12, v17
+; GFX8-NEXT: v_add_u16_e32 v11, v21, v11
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u16_e32 v12, v13, v18
+; GFX8-NEXT: v_add_u16_e32 v12, v20, v12
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v13, v14, v19
+; GFX8-NEXT: v_add_u16_e32 v13, v22, v13
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v14, v15, v20
+; GFX8-NEXT: v_add_u16_e32 v14, v23, v14
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v10, v16, v10
+; GFX8-NEXT: v_add_u16_e32 v10, v10, v15
; GFX8-NEXT: flat_store_short v[4:5], v11
; GFX8-NEXT: flat_store_short v[0:1], v12
-; GFX8-NEXT: flat_store_short v[2:3], v13
-; GFX8-NEXT: flat_store_short v[6:7], v14
-; GFX8-NEXT: flat_store_short v[8:9], v10
+; GFX8-NEXT: flat_store_short v[6:7], v13
+; GFX8-NEXT: flat_store_short v[8:9], v14
+; GFX8-NEXT: flat_store_short v[2:3], v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -341,77 +341,77 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-LABEL: addv_7i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0
+; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0
+; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 10, v0
+; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 12, v0
+; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0
+; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 2, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 4, v2
+; GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17]
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v17, v[6:7]
-; GFX8-NEXT: flat_load_ushort v18, v[8:9]
-; GFX8-NEXT: flat_load_ushort v19, v[10:11]
-; GFX8-NEXT: flat_load_ushort v20, v[12:13]
-; GFX8-NEXT: flat_load_ushort v21, v[14:15]
-; GFX8-NEXT: flat_load_ushort v22, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v13, vcc, 0, v1, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v15, vcc, 0, v1, s[10:11]
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2
+; GFX8-NEXT: flat_load_ushort v20, v[19:20]
+; GFX8-NEXT: flat_load_ushort v22, v[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v3, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v19, s[10:11], 0, v3, s[14:15]
+; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 8, v2
+; GFX8-NEXT: flat_load_ushort v23, v[8:9]
+; GFX8-NEXT: flat_load_ushort v24, v[10:11]
+; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 10, v2
+; GFX8-NEXT: flat_load_ushort v12, v[12:13]
+; GFX8-NEXT: flat_load_ushort v13, v[14:15]
+; GFX8-NEXT: v_add_u32_e64 v10, s[8:9], 12, v2
+; GFX8-NEXT: flat_load_ushort v14, v[16:17]
+; GFX8-NEXT: flat_load_ushort v15, v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/150288
More information about the llvm-commits mailing list