[llvm-branch-commits] [llvm] [AMDGPU] Add structural stall heuristic to scheduling strategies (PR #169617)
Austin Kerbow via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Mar 8 17:50:31 PDT 2026
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/169617
>From a7e4d0968b0ce47d4f3ea47073f95a2c87089acf Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 25 Nov 2025 22:18:19 -0800
Subject: [PATCH 1/2] [AMDGPU] Add structural stall heuristic to scheduling
strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Implements a structural stall heuristic that considers both resource
hazards and latency constraints when selecting instructions. In coexec,
this changes the pending queue from a binary “not ready to issue”
gate into one input of a unified candidate comparison. Pending
instructions still identify structural stalls in the current cycle, but
they are now evaluated directly against available instructions by stall
cost, making the heuristics both more intuitive and more expressive.
- Add getStructuralStallCycles() to GCNSchedStrategy that computes the
number of cycles an instruction must wait due to:
- Resource conflicts on unbuffered resources (from the SchedModel)
- Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number
of wait states until all hazards for an instruction are resolved,
providing cycle-accurate hazard information for scheduling heuristics.
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 29 +++++++++++++--
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 2 ++
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 +++
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 ++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 +++++++++++++++++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9 +++--
.../AMDGPU/coexec-sched-effective-stall.mir | 2 +-
7 files changed, 81 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 9dfb4ab3b282e..0183bd468172d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -203,9 +203,9 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
- // Prioritize instructions that read unbuffered resources by stall cycles.
- if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
- Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ // Otherwise compare candidates by the stall they would introduce if
+ // scheduled in the current cycle.
+ if (tryEffectiveStall(Cand, TryCand, *Zone))
return TryCand.Reason != NoCand;
}
@@ -267,6 +267,29 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}
+bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone) const {
+ // Treat structural and latency stalls as a single scheduling cost for the
+ // current cycle.
+ unsigned TryStructStall = getStructuralStallCycles(Zone, TryCand.SU);
+ unsigned TryLatencyStall = Zone.getLatencyStallCycles(TryCand.SU);
+ unsigned TryEffectiveStall = std::max(TryStructStall, TryLatencyStall);
+
+ unsigned CandStructStall = getStructuralStallCycles(Zone, Cand.SU);
+ unsigned CandLatencyStall = Zone.getLatencyStallCycles(Cand.SU);
+ unsigned CandEffectiveStall = std::max(CandStructStall, CandLatencyStall);
+
+ LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
+ dbgs() << "Effective stalls: try=" << TryEffectiveStall
+ << " (struct=" << TryStructStall << ", lat=" << TryLatencyStall
+ << ") cand=" << CandEffectiveStall << " (struct=" << CandStructStall
+ << ", lat=" << CandLatencyStall << ")\n";
+ });
+
+ return tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall);
+}
+
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 2b661f03aa50a..07252c3fb45a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -23,6 +23,8 @@ class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
+ bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary &Zone) const;
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand, bool &PickedPending,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 30a9d1d2ab149..36ab5e27e0257 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -334,6 +334,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
return std::max(W, NopPadding.getValue());
}
+unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
+ return const_cast<GCNHazardRecognizer *>(this)->PreEmitNoopsCommon(MI);
+}
+
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
if (MI->isBundle())
return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index b331504d40113..04ee2ca7a50a6 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -169,6 +169,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void EmitInstruction(SUnit *SU) override;
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;
+
+ /// Returns the number of wait states until all hazards for \p MI are
+ /// resolved. This is useful for scheduling heuristics that want
+ /// cycle-accurate hazard information rather than just a boolean. Unlike
+ /// PreEmitNoops, this does not modify state or fix hazards.
+ unsigned getHazardWaitStates(MachineInstr *MI) const;
void EmitNoop() override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 779bf06b3334e..f86abf6afbda6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
+#include "GCNHazardRecognizer.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -230,6 +231,40 @@ void GCNSchedStrategy::getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
+unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
+ SUnit *SU) const {
+ // Only implemented for top-down scheduling currently.
+ if (!Zone.isTop() || !SU)
+ return 0;
+
+ MachineInstr *MI = SU->getInstr();
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned Stall = 0;
+
+ // Query SchedModel for resource stalls (unbuffered resources).
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC))) {
+ unsigned NextAvail =
+ Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
+ PE.AcquireAtCycle)
+ .first;
+ if (NextAvail > CurrCycle)
+ Stall = std::max(Stall, NextAvail - CurrCycle);
+ }
+ }
+
+ // Query HazardRecognizer for sequence-dependent hazard penalties.
+ if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
+ auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
+ Stall = std::max(Stall, HR->getHazardWaitStates(MI));
+ }
+
+ return Stall;
+}
+
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 04cffd7773847..b0c73334b38cc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -60,6 +60,10 @@ class GCNSchedStrategy : public GenericScheduler {
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Estimate how many cycles \p SU must wait due to structural hazards at the
+ /// current boundary cycle. Returns zero when no stall is required.
+ unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;
+
/// Evaluates instructions in the pending queue using a subset of scheduling
/// heuristics.
///
@@ -68,8 +72,9 @@ class GCNSchedStrategy : public GenericScheduler {
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone) const;
+ virtual bool tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index bac94bdffd375..887d458d59d4c 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -102,9 +102,9 @@ body: |
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_NOP 0
; COEXEC-NEXT: S_NOP 0
+ ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
>From ed37edef1134c4a792c88dcc8ccf196f7119fe2f Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Fri, 27 Feb 2026 22:58:29 -0800
Subject: [PATCH 2/2] Update coexec-sched-effective-stall.mir
---
llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 887d458d59d4c..4196b3abec7ab 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -3,8 +3,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
--- |
- ; Pre-commit test for stall heuristic
-
define void @test-sched-effective-stall() #0 { ret void }
define void @test-sched-pending-structural-stall() #0 { ret void }
More information about the llvm-branch-commits
mailing list