[llvm-branch-commits] [llvm] [AMDGPU] Add structural stall heuristic to scheduling strategies (PR #169617)
Austin Kerbow via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Nov 25 22:58:56 PST 2025
https://github.com/kerbowa created https://github.com/llvm/llvm-project/pull/169617
Implements a structural stall heuristic that considers both resource
hazards and latency constraints when selecting instructions from the
pending queue.
- Add getStructuralStallCycles() to GCNSchedStrategy that computes the
  number of cycles an instruction must wait due to:
  - Resource conflicts on unbuffered resources (from the SchedModel)
  - Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number
  of wait states until all hazards for an instruction are resolved,
  providing cycle-accurate hazard information for scheduling heuristics.
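In short, the pending-queue heuristic compares candidates by their effective
stall: the maximum of the structural stall (hardware not ready) and the
latency stall (operands not ready). Below is a minimal sketch of that
comparison using hypothetical stand-in types rather than the real
SchedCandidate/SchedBoundary machinery, assuming the per-candidate stall
estimates have already been computed:

#include <algorithm>

// Hypothetical stand-in for a scheduling candidate; the real code queries
// SchedBoundary and GCNHazardRecognizer for these values.
struct Candidate {
  unsigned StructuralStallCycles; // resource/hazard constraints (HW not ready)
  unsigned LatencyStallCycles;    // data-dependency constraints (operands not ready)
};

static unsigned effectiveStall(const Candidate &C) {
  // Effective stall = max(structural stall, latency stall).
  return std::max(C.StructuralStallCycles, C.LatencyStallCycles);
}

// True if Try should win the stall comparison over Cand, e.g. preferring a
// pending WMMA with a small structural stall over an instruction still
// waiting on a long memory-load latency.
static bool preferLowerEffectiveStall(const Candidate &Try,
                                      const Candidate &Cand) {
  return effectiveStall(Try) < effectiveStall(Cand);
}

In the actual tryPendingCandidate() this comparison only runs after the
physical-register bias and register-pressure checks, and before the
resource-delta tie-breakers.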
From bbf69d1b280b197c3f0a6e0fc9edf91f361ec407 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 25 Nov 2025 22:18:19 -0800
Subject: [PATCH] [AMDGPU] Add structural stall heuristic to scheduling
strategies
Implements a structural stall heuristic that considers both resource
hazards and latency constraints when selecting instructions from the
pending queue.
- Add getStructuralStallCycles() to GCNSchedStrategy that computes the
  number of cycles an instruction must wait due to:
  - Resource conflicts on unbuffered resources (from the SchedModel)
  - Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number
  of wait states until all hazards for an instruction are resolved,
  providing cycle-accurate hazard information for scheduling heuristics.
---
.../Target/AMDGPU/AMDGPUMLSchedStrategy.cpp | 72 +++++++++++++++++++
.../lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h | 4 +-
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 ++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 40 +++++++++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 8 ++-
.../AMDGPU/ml-sched-effective-stall.mir | 8 ++-
7 files changed, 136 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp
index 6bad7cc172709..20d5ca61ac01d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.cpp
@@ -13,6 +13,10 @@
#include "AMDGPUMLSchedStrategy.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "machine-scheduler"
+
using namespace llvm;
AMDGPUMLSchedStrategy::AMDGPUMLSchedStrategy(const MachineSchedContext *C)
@@ -121,6 +125,74 @@ bool AMDGPUMLSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}
+bool AMDGPUMLSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+
+ // Bias PhysReg Defs and copies to their uses and defs, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // Compare effective stall cycles between candidates.
+ // Effective stall = max(structural stall, latency stall)
+ // - Structural stalls: resource/hazard constraints (HW not ready)
+ // - Latency stalls: data dependency constraints (operands not ready)
+ //
+ // This allows picking a pending instruction with structural stalls over
+ // an available instruction with higher latency stalls (e.g., scheduling
+ // a WMMA while waiting for a memory load result).
+ unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU);
+ unsigned TryLatencyStall = Zone->getLatencyStallCycles(TryCand.SU);
+ unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall);
+
+ unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU);
+ unsigned CandLatencyStall = Zone->getLatencyStallCycles(Cand.SU);
+ unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall);
+
+ LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
+ dbgs() << "Effective stalls: try=" << TryEffectiveStall
+ << " (struct=" << TryStructStall << ", lat=" << TryLatencyStall
+ << ") cand=" << CandEffectiveStall
+ << " (struct=" << CandStructStall << ", lat=" << CandLatencyStall
+ << ")\n";
+ });
+
+ if (tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+ }
+
+ return false;
+}
+
AMDGPUMLPostSchedStrategy::AMDGPUMLPostSchedStrategy(
const MachineSchedContext *C)
: PostGenericScheduler(C) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
index 1a6d042231942..b72b193c70786 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMLSchedStrategy.h
@@ -20,6 +20,8 @@ class AMDGPUMLSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
public:
AMDGPUMLSchedStrategy(const MachineSchedContext *C);
@@ -33,4 +35,4 @@ class AMDGPUMLPostSchedStrategy : public PostGenericScheduler {
AMDGPUMLPostSchedStrategy(const MachineSchedContext *C);
};
-} // End namespace llvm
\ No newline at end of file
+} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 7a2f84a2f73eb..ec160d15da1ac 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -313,6 +313,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
return std::max(W, NopPadding.getValue());
}
+unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
+ return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI);
+}
+
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (MI->isBundle())
return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 67beffadc0913..be914d8657870 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -145,6 +145,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void EmitInstruction(SUnit *SU) override;
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;
+
+ /// Returns the number of wait states until all hazards for \p MI are
+ /// resolved. This is useful for scheduling heuristics that want
+ /// cycle-accurate hazard information rather than just a boolean. Unlike
+ /// PreEmitNoops, this does not modify state or fix hazards.
+ unsigned getHazardWaitStates(MachineInstr *MI) const;
void EmitNoop() override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index d9cb80c7c1676..00d1a25d0b6c8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
+#include "GCNHazardRecognizer.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -218,6 +219,40 @@ void GCNSchedStrategy::getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
+unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
+ SUnit *SU) const {
+ // Only implemented for top-down scheduling currently.
+ if (!Zone.isTop() || !SU)
+ return 0;
+
+ MachineInstr *MI = SU->getInstr();
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned Stall = 0;
+
+ // Query SchedModel for resource stalls (unbuffered resources).
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC))) {
+ unsigned NextAvail =
+ Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
+ PE.AcquireAtCycle)
+ .first;
+ if (NextAvail > CurrCycle)
+ Stall = std::max(Stall, NextAvail - CurrCycle);
+ }
+ }
+
+ // Query HazardRecognizer for sequence-dependent hazard penalties.
+ if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
+ auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
+ Stall = std::max(Stall, HR->getHazardWaitStates(MI));
+ }
+
+ return Stall;
+}
+
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
@@ -673,6 +708,11 @@ bool GCNSchedStrategy::tryPendingCandidate(SchedCandidate &Cand,
bool SameBoundary = Zone != nullptr;
if (SameBoundary) {
+ unsigned TryStructStall = getStructuralStallCycles(*Zone, TryCand.SU);
+ unsigned CandStructStall = getStructuralStallCycles(*Zone, Cand.SU);
+ if (tryLess(TryStructStall, CandStructStall, TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+
TryCand.initResourceDelta(DAG, SchedModel);
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
TryCand, Cand, ResourceReduce))
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 367f47c3ca4ae..048eeecac0ab9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -56,6 +56,10 @@ class GCNSchedStrategy : public GenericScheduler {
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Estimate how many cycles \p SU must wait due to structural hazards at the
+ /// current boundary cycle. Returns zero when no stall is required.
+ unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;
+
/// Evaluates instructions in the pending queue using a subset of scheduling
/// heuristics.
///
@@ -64,8 +68,8 @@ class GCNSchedStrategy : public GenericScheduler {
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone) const;
+ virtual bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
diff --git a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir
index 6c6d1c5728a34..08d9626d69f90 100644
--- a/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/ml-sched-effective-stall.mir
@@ -10,6 +10,8 @@
attributes #1 = { "amdgpu-waves-per-eu"="1,1" }
...
+# The scheduler should reorder the use of the global load after WMMAs to hide memory latency.
+
---
name: with_ml_workload_attr
tracksRegLiveness: true
@@ -29,8 +31,8 @@ body: |
; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
;
; ML-LABEL: name: with_ml_workload_attr
@@ -47,8 +49,8 @@ body: |
; ML-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; ML-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; ML-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
@@ -99,8 +101,8 @@ body: |
; ML-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; ML-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; ML-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
- ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; ML-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec
+ ; ML-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; ML-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; ML-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
; ML-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
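For reference, getStructuralStallCycles() combines two independent stall
sources, sketched here with simplified stand-in types (ResourceState and
HazardState are hypothetical; the real code walks the MCWriteProcResEntry
list for the instruction's sched class and calls
GCNHazardRecognizer::getHazardWaitStates()):

#include <algorithm>

// Hypothetical, simplified inputs; the real implementation queries the
// SchedBoundary's resource table and the GCNHazardRecognizer.
struct ResourceState {
  unsigned NextAvailableCycle; // when the unbuffered resource frees up
};
struct HazardState {
  unsigned WaitStates; // cycles until sequence-dependent hazards resolve
};

static unsigned structuralStallCycles(unsigned CurrCycle,
                                      const ResourceState &Res,
                                      const HazardState &Haz) {
  unsigned Stall = 0;
  if (Res.NextAvailableCycle > CurrCycle)
    Stall = Res.NextAvailableCycle - CurrCycle; // resource-conflict stall
  return std::max(Stall, Haz.WaitStates);       // hazard wait-state stall
}

The top-down-only restriction and the SU->hasReservedResource guard in the
real code are omitted here for brevity.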