[llvm-branch-commits] [llvm] [AMDGPU] Add structural stall heuristic to scheduling strategies (PR #169617)
Austin Kerbow via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Mar 17 12:47:43 PDT 2026
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/169617
>From 4a8b61fc63b6c0863f291a130e1aea5d2777feba Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 25 Nov 2025 22:18:19 -0800
Subject: [PATCH 1/2] [AMDGPU] Add structural stall heuristic to scheduling
strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Implements a structural stall heuristic that considers both resource
hazards and latency constraints when selecting instructions. In the coexec
strategy, this changes the pending queue from a binary "not ready to issue"
distinction into part of a unified candidate comparison. Pending
instructions still identify structural stalls in the current cycle, but
they are now evaluated directly against available instructions by stall
cost, making the heuristics both more intuitive and more expressive.
- Add getStructuralStallCycles() to GCNSchedStrategy that computes the
number of cycles an instruction must wait due to:
- Resource conflicts on unbuffered resources (from the SchedModel)
- Sequence-dependent hazards (from GCNHazardRecognizer)
- Add getHazardWaitStates() to GCNHazardRecognizer that returns the number
of wait states until all hazards for an instruction are resolved,
providing cycle-accurate hazard information for scheduling heuristics.
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 41 +++++++++++++++++--
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 2 +
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 ++
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 6 +++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 35 ++++++++++++++++
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 9 +++-
.../AMDGPU/coexec-sched-effective-stall.mir | 4 +-
7 files changed, 93 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 9d50c4e047943..d2cbd34fe6997 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -203,9 +203,9 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
tryLatency(TryCand, Cand, *Zone))
return TryCand.Reason != NoCand;
- // Prioritize instructions that read unbuffered resources by stall cycles.
- if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
- Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ // Otherwise compare candidates by the stall they would introduce if
+ // scheduled in the current cycle.
+ if (tryEffectiveStall(Cand, TryCand, *Zone))
return TryCand.Reason != NoCand;
}
@@ -267,6 +267,41 @@ bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
return false;
}
+bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone) const {
+ // Treat structural and latency stalls as a single scheduling cost for the
+ // current cycle.
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned TryReadyCycle =
+ Zone.isTop() ? TryCand.SU->TopReadyCycle : TryCand.SU->BotReadyCycle;
+ unsigned TryStructStall = getStructuralStallCycles(Zone, TryCand.SU);
+ unsigned TryLatencyStall = Zone.getLatencyStallCycles(TryCand.SU);
+ unsigned TryReadyStall =
+ TryReadyCycle > CurrCycle ? TryReadyCycle - CurrCycle : 0;
+ unsigned TryEffectiveStall =
+ std::max({TryReadyStall, TryStructStall, TryLatencyStall});
+
+ unsigned CandReadyCycle =
+ Zone.isTop() ? Cand.SU->TopReadyCycle : Cand.SU->BotReadyCycle;
+ unsigned CandStructStall = getStructuralStallCycles(Zone, Cand.SU);
+ unsigned CandLatencyStall = Zone.getLatencyStallCycles(Cand.SU);
+ unsigned CandReadyStall =
+ CandReadyCycle > CurrCycle ? CandReadyCycle - CurrCycle : 0;
+ unsigned CandEffectiveStall =
+ std::max({CandReadyStall, CandStructStall, CandLatencyStall});
+
+ LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
+ dbgs() << "Effective stalls: try=" << TryEffectiveStall
+ << " (ready=" << TryReadyStall << ", struct=" << TryStructStall
+ << ", lat=" << TryLatencyStall << ") cand=" << CandEffectiveStall
+ << " (ready=" << CandReadyStall << ", struct=" << CandStructStall
+ << ", lat=" << CandLatencyStall << ")\n";
+ });
+
+ return tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall);
+}
+
ScheduleDAGInstrs *
llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 2b661f03aa50a..07252c3fb45a7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -23,6 +23,8 @@ class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
protected:
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone) const override;
+ bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary &Zone) const;
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand, bool &PickedPending,
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 8e79abfb2f601..4c23ec4d4aca2 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -334,6 +334,10 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
return std::max(W, NopPadding.getValue());
}
+unsigned GCNHazardRecognizer::getHazardWaitStates(MachineInstr *MI) const {
+ return this->PreEmitNoopsCommon(MI);
+}
+
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) const {
if (MI->isBundle())
return 0;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 9f0468831f65e..920939a81db7f 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -171,6 +171,12 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
void EmitInstruction(SUnit *SU) override;
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;
+
+ /// Returns the number of wait states until all hazards for \p MI are
+ /// resolved. This is useful for scheduling heuristics that want
+ /// cycle-accurate hazard information rather than just a boolean. Unlike
+ /// PreEmitNoops, this does not modify state or fix hazards.
+ unsigned getHazardWaitStates(MachineInstr *MI) const;
void EmitNoop() override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 3e1ae9f23c91b..16f58ba07f280 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -25,6 +25,7 @@
#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
+#include "GCNHazardRecognizer.h"
#include "GCNRegPressure.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -230,6 +231,40 @@ void GCNSchedStrategy::getRegisterPressures(
Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
}
+unsigned GCNSchedStrategy::getStructuralStallCycles(SchedBoundary &Zone,
+ SUnit *SU) const {
+ // Only implemented for top-down scheduling currently.
+ if (!Zone.isTop() || !SU)
+ return 0;
+
+ MachineInstr *MI = SU->getInstr();
+ unsigned CurrCycle = Zone.getCurrCycle();
+ unsigned Stall = 0;
+
+ // Query SchedModel for resource stalls (unbuffered resources).
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (const MCWriteProcResEntry &PE :
+ make_range(SchedModel->getWriteProcResBegin(SC),
+ SchedModel->getWriteProcResEnd(SC))) {
+ unsigned NextAvail =
+ Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
+ PE.AcquireAtCycle)
+ .first;
+ if (NextAvail > CurrCycle)
+ Stall = std::max(Stall, NextAvail - CurrCycle);
+ }
+ }
+
+ // Query HazardRecognizer for sequence-dependent hazard penalties.
+ if (Zone.HazardRec && Zone.HazardRec->isEnabled()) {
+ auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
+ Stall = std::max(Stall, HR->getHazardWaitStates(MI));
+ }
+
+ return Stall;
+}
+
void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
const RegPressureTracker &RPTracker,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 04cffd7773847..b0c73334b38cc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -60,6 +60,10 @@ class GCNSchedStrategy : public GenericScheduler {
const SIRegisterInfo *SRI, unsigned SGPRPressure,
unsigned VGPRPressure, bool IsBottomUp);
+ /// Estimate how many cycles \p SU must wait due to structural hazards at the
+ /// current boundary cycle. Returns zero when no stall is required.
+ unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const;
+
/// Evaluates instructions in the pending queue using a subset of scheduling
/// heuristics.
///
@@ -68,8 +72,9 @@ class GCNSchedStrategy : public GenericScheduler {
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
- bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
- SchedBoundary *Zone) const;
+ virtual bool tryPendingCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index bac94bdffd375..4196b3abec7ab 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -3,8 +3,6 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
--- |
- ; Pre-commit test for stall heuristic
-
define void @test-sched-effective-stall() #0 { ret void }
define void @test-sched-pending-structural-stall() #0 { ret void }
@@ -102,9 +100,9 @@ body: |
; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_NOP 0
; COEXEC-NEXT: S_NOP 0
+ ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
%0:vreg_512_align2 = IMPLICIT_DEF
%1:vreg_512_align2 = IMPLICIT_DEF
>From c064b5a64ac1262f639c8a3384408f68b2097958 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Tue, 17 Mar 2026 12:40:46 -0700
Subject: [PATCH 2/2] Address comments.
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 53 ++++++++++---------
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 5 +-
.../AMDGPU/coexec-sched-effective-stall.mir | 1 +
3 files changed, 31 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index d2cbd34fe6997..977c6f56ad15d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -272,34 +272,37 @@ bool AMDGPUCoExecSchedStrategy::tryEffectiveStall(SchedCandidate &Cand,
SchedBoundary &Zone) const {
// Treat structural and latency stalls as a single scheduling cost for the
// current cycle.
+ struct StallCosts {
+ unsigned Ready = 0;
+ unsigned Structural = 0;
+ unsigned Latency = 0;
+ unsigned Effective = 0;
+ };
+
unsigned CurrCycle = Zone.getCurrCycle();
- unsigned TryReadyCycle =
- Zone.isTop() ? TryCand.SU->TopReadyCycle : TryCand.SU->BotReadyCycle;
- unsigned TryStructStall = getStructuralStallCycles(Zone, TryCand.SU);
- unsigned TryLatencyStall = Zone.getLatencyStallCycles(TryCand.SU);
- unsigned TryReadyStall =
- TryReadyCycle > CurrCycle ? TryReadyCycle - CurrCycle : 0;
- unsigned TryEffectiveStall =
- std::max({TryReadyStall, TryStructStall, TryLatencyStall});
-
- unsigned CandReadyCycle =
- Zone.isTop() ? Cand.SU->TopReadyCycle : Cand.SU->BotReadyCycle;
- unsigned CandStructStall = getStructuralStallCycles(Zone, Cand.SU);
- unsigned CandLatencyStall = Zone.getLatencyStallCycles(Cand.SU);
- unsigned CandReadyStall =
- CandReadyCycle > CurrCycle ? CandReadyCycle - CurrCycle : 0;
- unsigned CandEffectiveStall =
- std::max({CandReadyStall, CandStructStall, CandLatencyStall});
-
- LLVM_DEBUG(if (TryEffectiveStall || CandEffectiveStall) {
- dbgs() << "Effective stalls: try=" << TryEffectiveStall
- << " (ready=" << TryReadyStall << ", struct=" << TryStructStall
- << ", lat=" << TryLatencyStall << ") cand=" << CandEffectiveStall
- << " (ready=" << CandReadyStall << ", struct=" << CandStructStall
- << ", lat=" << CandLatencyStall << ")\n";
+ auto GetStallCosts = [&](SUnit *SU) {
+ unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+ StallCosts Costs;
+ Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
+ Costs.Structural = getStructuralStallCycles(Zone, SU);
+ Costs.Latency = Zone.getLatencyStallCycles(SU);
+ Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
+ return Costs;
+ };
+
+ StallCosts TryCosts = GetStallCosts(TryCand.SU);
+ StallCosts CandCosts = GetStallCosts(Cand.SU);
+
+ LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
+ dbgs() << "Effective stalls: try=" << TryCosts.Effective
+ << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
+ << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
+ << " (ready=" << CandCosts.Ready
+ << ", struct=" << CandCosts.Structural
+ << ", lat=" << CandCosts.Latency << ")\n";
});
- return tryLess(TryEffectiveStall, CandEffectiveStall, TryCand, Cand, Stall);
+ return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
}
ScheduleDAGInstrs *
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index b0c73334b38cc..0d4ef070e57af 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -72,9 +72,8 @@ class GCNSchedStrategy : public GenericScheduler {
/// invisible to scheduling heuristics. However, in certain scenarios (such as
/// avoiding register spilling), it may be beneficial to consider scheduling
/// these not-yet-ready instructions.
- virtual bool tryPendingCandidate(SchedCandidate &Cand,
- SchedCandidate &TryCand,
- SchedBoundary *Zone) const;
+ bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const;
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
index 4196b3abec7ab..f9f9a27e9af4c 100644
--- a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -9,6 +9,7 @@
attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
...
+---
name: test-sched-effective-stall
tracksRegLiveness: true
body: |
More information about the llvm-branch-commits
mailing list