[llvm] [AMDGPU] Add ML-oriented coexec scheduler selection and queue handling (PR #169616)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 8 20:40:12 PDT 2026
https://github.com/kerbowa updated https://github.com/llvm/llvm-project/pull/169616
>From 65f60802a61eba690be8731dd9e7526b1774d9c2 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Mon, 17 Nov 2025 15:33:49 -0800
Subject: [PATCH 1/3] [AMDGPU] Add ML-oriented coexec scheduler selection and
queue handling
This patch adds the initial coexec scheduler scaffold for machine
learning workloads on gfx1250.
It introduces function and module-level controls for selecting the
AMDGPU preRA and postRA schedulers, including an `amdgpu-workload-type`
module flag that maps ML workloads to coexec preRA scheduling and a nop
postRA scheduler by default.
It also updates the coexec scheduler to use a simplified top-down
candidate selection path that considers both available and pending
queues through a single flow, setting up follow-on heuristic work.
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 275 ++++++++++++++++++
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 43 +++
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 69 ++++-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 18 +-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 11 +
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 22 ++
.../amdgpu-workload-type-scheduler-debug.mir | 114 ++++++++
.../AMDGPU/coexec-sched-effective-stall.mir | 124 ++++++++
9 files changed, 663 insertions(+), 14 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
new file mode 100644
index 0000000000000..a1a8a80c5bfb1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -0,0 +1,275 @@
+//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Coexecution-focused scheduling strategy for AMDGPU.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCoExecSchedStrategy.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-scheduler"
+
+namespace {
+
+// Used to disable post-RA scheduling with function level granularity.
+class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
+public:
+ explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
+ : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
+
+ // Do nothing.
+ void schedule() override {}
+};
+
+} // namespace
+
+static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
+ // pickOnlyChoice() releases pending instructions and checks for new hazards.
+ SUnit *OnlyChoice = Zone.pickOnlyChoice();
+ if (!Zone.Pending.empty())
+ return nullptr;
+
+ return OnlyChoice;
+}
+
+AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
+ const MachineSchedContext *C)
+ : GCNSchedStrategy(C) {
+ SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+ SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+ // Use more accurate GCN pressure trackers.
+ UseGCNTrackers = true;
+}
+
+void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
+ assert((PreRADirection == MISched::Unspecified ||
+ PreRADirection == MISched::TopDown) &&
+ "coexec scheduler only supports top-down scheduling");
+ // Coexecution scheduling strategy is only done top-down to support new
+ // resource balancing heuristics.
+ RegionPolicy.OnlyTopDown = true;
+ RegionPolicy.OnlyBottomUp = false;
+
+ GCNSchedStrategy::initialize(DAG);
+}
+
+SUnit *AMDGPUCoExecSchedStrategy::pickNode(bool &IsTopNode) {
+ assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
+ "coexec scheduler only supports top-down scheduling");
+
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+
+ bool PickedPending = false;
+ SUnit *SU = nullptr;
+ do {
+ PickedPending = false;
+ SU = pickOnlyChoice(Top);
+ if (!SU) {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
+ PickedPending, /*IsBottomUp=*/false);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } while (SU->isScheduled);
+
+ if (PickedPending) {
+ unsigned ReadyCycle = SU->TopReadyCycle;
+ unsigned CurrentCycle = Top.getCurrCycle();
+ if (ReadyCycle > CurrentCycle)
+ Top.bumpCycle(ReadyCycle);
+
+ // checkHazard() does not expose the exact cycle where the hazard clears.
+ while (Top.checkHazard(SU))
+ Top.bumpCycle(Top.getCurrCycle() + 1);
+
+ Top.releasePending();
+ }
+
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
+ << *SU->getInstr());
+
+ assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
+ return SU;
+}
+
+void AMDGPUCoExecSchedStrategy::pickNodeFromQueue(
+ SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker, SchedCandidate &Cand,
+ bool &PickedPending, bool IsBottomUp) {
+ assert(Zone.isTop() && "coexec scheduler only supports top boundary");
+ assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
+
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
+ ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+ unsigned SGPRPressure = 0;
+ unsigned VGPRPressure = 0;
+ PickedPending = false;
+ if (DAG->isTrackingPressure()) {
+ if (!useGCNTrackers()) {
+ SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ } else {
+ SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
+ VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
+ }
+ }
+
+ auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
+ for (SUnit *SU : Q) {
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
+ VGPRPressure, IsBottomUp);
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
+ PickedPending = FromPending;
+ Cand.setBest(TryCand);
+ } else {
+ printCandidateDecision(TryCand, Cand);
+ }
+ }
+ };
+
+ LLVM_DEBUG(dbgs() << "Available Q:\n");
+ EvaluateQueue(Zone.Available, /*FromPending=*/false);
+
+ LLVM_DEBUG(dbgs() << "Pending Q:\n");
+ EvaluateQueue(Zone.Pending, /*FromPending=*/true);
+}
+
+bool AMDGPUCoExecSchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) const {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = FirstValid;
+ return true;
+ }
+
+ // Bias PhysReg defs and copies to their uses and definitions, respectively.
+ if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+ biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+ return TryCand.Reason != NoCand;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+ RegExcess, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ // We only compare a subset of features when comparing nodes between
+ // Top and Bottom boundary. Some properties are simply incomparable, in many
+ // other instances we should only override the other boundary if something
+ // is a clear good pick on one boundary. Skip heuristics that are more
+ // "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
+ // heuristics to take precedence.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
+ unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
+ bool CandIsClusterSucc =
+ isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
+ bool TryCandIsClusterSucc =
+ isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
+
+ if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
+ Cluster))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+ return TryCand.Reason != NoCand;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() &&
+ tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+ Cand, RegMax, TRI, DAG->MF))
+ return TryCand.Reason != NoCand;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return TryCand.Reason != NoCand;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources, TryCand, Cand,
+ ResourceDemand))
+ return TryCand.Reason != NoCand;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return TryCand.Reason != NoCand;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+ (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+ScheduleDAGInstrs *
+llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
+ LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
+ << C->MF->getName() << '\n');
+ return new GCNScheduleDAGMILive(
+ C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
+}
+
+ScheduleDAGInstrs *
+llvm::createGCNCoExecPostMachineScheduler(MachineSchedContext *C) {
+ LLVM_DEBUG(dbgs() << "AMDGPU coexec postRA scheduler selected (nop) for "
+ << C->MF->getName() << '\n');
+ return new GCNNoopPostScheduleDAG(C);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
new file mode 100644
index 0000000000000..0f7289992ec72
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -0,0 +1,43 @@
+//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Coexecution-focused scheduling strategy for AMDGPU.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
+
+#include "GCNSchedStrategy.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
+protected:
+ bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone) const override;
+ void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand, bool &PickedPending,
+ bool IsBottomUp);
+
+public:
+ AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
+
+ void initialize(ScheduleDAGMI *DAG) override;
+ SUnit *pickNode(bool &IsTopNode) override;
+};
+
+ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
+ScheduleDAGInstrs *createGCNCoExecPostMachineScheduler(MachineSchedContext *C);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5b3effbcc7179..cffebc602b2bd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUBarrierLatency.h"
+#include "AMDGPUCoExecSchedStrategy.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
@@ -574,6 +575,58 @@ static cl::opt<std::string>
cl::desc("Select custom AMDGPU scheduling strategy."),
cl::Hidden, cl::init(""));
+enum class AMDGPUPostSchedStrategy {
+ Default,
+ Nop,
+};
+
+static StringRef getAMDGPUWorkloadType(const Module *M) {
+ if (!M)
+ return "";
+
+ auto *WorkloadType =
+ dyn_cast_or_null<MDString>(M->getModuleFlag("amdgpu-workload-type"));
+ if (!WorkloadType)
+ return "";
+
+ return WorkloadType->getString();
+}
+
+static StringRef getAMDGPUSchedStrategy(const Function &F) {
+ Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
+ if (SchedStrategyAttr.isValid())
+ return SchedStrategyAttr.getValueAsString();
+
+ if (!AMDGPUSchedStrategy.empty())
+ return AMDGPUSchedStrategy;
+
+ StringRef WorkloadType = getAMDGPUWorkloadType(F.getParent());
+ // ML workloads use coexec scheduling defaults.
+ if (WorkloadType.equals_insensitive("ml"))
+ return "coexec";
+
+ return "";
+}
+
+static AMDGPUPostSchedStrategy getAMDGPUPostSchedStrategy(const Function &F) {
+ Attribute PostSchedStrategyAttr =
+ F.getFnAttribute("amdgpu-post-sched-strategy");
+ if (PostSchedStrategyAttr.isValid()) {
+ StringRef PostSchedStrategy = PostSchedStrategyAttr.getValueAsString();
+ if (PostSchedStrategy == "nop")
+ return AMDGPUPostSchedStrategy::Nop;
+ // Allow explicit override to keep post-RA scheduling enabled even when
+ // preRA resolves to coexec via module defaults.
+ if (PostSchedStrategy == "default")
+ return AMDGPUPostSchedStrategy::Default;
+ }
+
+ if (getAMDGPUSchedStrategy(F) == "coexec")
+ return AMDGPUPostSchedStrategy::Nop;
+
+ return AMDGPUPostSchedStrategy::Default;
+}
+
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
@@ -1241,11 +1294,7 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
- Attribute SchedStrategyAttr =
- C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
- StringRef SchedStrategy = SchedStrategyAttr.isValid()
- ? SchedStrategyAttr.getValueAsString()
- : AMDGPUSchedStrategy;
+ StringRef SchedStrategy = getAMDGPUSchedStrategy(C->MF->getFunction());
if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
@@ -1262,11 +1311,21 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
if (SchedStrategy == "iterative-maxocc")
return createIterativeGCNMaxOccupancyMachineScheduler(C);
+ if (SchedStrategy == "coexec")
+ return createGCNCoExecMachineScheduler(C);
+
return createGCNMaxOccupancyMachineScheduler(C);
}
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
+ // Post-RA scheduler selection can be controlled per-function
+ // ("amdgpu-post-sched-strategy"), and coexec disables post-RA scheduling
+ // based on function/module scheduler policy.
+ if (getAMDGPUPostSchedStrategy(C->MF->getFunction()) ==
+ AMDGPUPostSchedStrategy::Nop)
+ return createGCNCoExecPostMachineScheduler(C);
+
ScheduleDAGMI *DAG =
new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index ae684a58cfd26..6bd6b4feac041 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -86,6 +86,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp
AMDGPUMemoryUtils.cpp
+ AMDGPUCoExecSchedStrategy.cpp
AMDGPUIGroupLP.cpp
AMDGPULowerVGPREncoding.cpp
AMDGPUMCResourceInfo.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 77c322eb3178e..779bf06b3334e 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -196,7 +196,7 @@ static bool canUsePressureDiffs(const SUnit &SU) {
return true;
}
-static void getRegisterPressures(
+void GCNSchedStrategy::getRegisterPressures(
bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
@@ -204,7 +204,7 @@ static void getRegisterPressures(
// getDownwardPressure() and getUpwardPressure() make temporary changes to
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
- if (!GCNTrackers) {
+ if (!useGCNTrackers()) {
AtTop
? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
: TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
@@ -256,7 +256,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
//
// In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
// PressureDiffs.
- if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
+ if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
DownwardTracker, UpwardTracker, DAG, SRI);
} else {
@@ -400,7 +400,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
unsigned VGPRPressure = 0;
IsPending = false;
if (DAG->isTrackingPressure()) {
- if (!GCNTrackers) {
+ if (!useGCNTrackers()) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
} else {
@@ -623,7 +623,7 @@ SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
}
void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
- if (GCNTrackers) {
+ if (useGCNTrackers()) {
MachineInstr *MI = SU->getInstr();
IsTopNode ? (void)DownwardTracker.advance(MI, false)
: UpwardTracker.recede(*MI);
@@ -707,7 +707,7 @@ GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
- GCNTrackers = GCNTrackers & !IsLegacyScheduler;
+ UseGCNTrackers = GCNTrackers & !IsLegacyScheduler;
}
GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
@@ -1136,9 +1136,10 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
void GCNScheduleDAGMILive::runSchedStages() {
LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
+ GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
if (!Regions.empty()) {
BBLiveInMap = getRegionLiveInMap();
- if (GCNTrackers)
+ if (S.useGCNTrackers())
RegionLiveOuts.buildLiveRegMap();
}
@@ -1150,7 +1151,6 @@ void GCNScheduleDAGMILive::runSchedStages() {
}
#endif
- GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
if (!Stage->initGCNSchedStage())
@@ -1166,7 +1166,7 @@ void GCNScheduleDAGMILive::runSchedStages() {
continue;
}
- if (GCNTrackers) {
+ if (S.useGCNTrackers()) {
GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
GCNRPTracker::LiveRegSet *RegionLiveIns =
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 99fd55db33285..04cffd7773847 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -74,6 +74,13 @@ class GCNSchedStrategy : public GenericScheduler {
void printCandidateDecision(const SchedCandidate &Current,
const SchedCandidate &Preferred);
+ void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker,
+ SUnit *SU, std::vector<unsigned> &Pressure,
+ std::vector<unsigned> &MaxPressure,
+ GCNDownwardRPTracker &DownwardTracker,
+ GCNUpwardRPTracker &UpwardTracker,
+ ScheduleDAGMI *DAG, const SIRegisterInfo *SRI);
+
std::vector<unsigned> Pressure;
std::vector<unsigned> MaxPressure;
@@ -98,6 +105,8 @@ class GCNSchedStrategy : public GenericScheduler {
// GCN RP Tracker for botttom-up scheduling
mutable GCNUpwardRPTracker UpwardTracker;
+ bool UseGCNTrackers = false;
+
public:
// schedule() have seen register pressure over the critical limits and had to
// track register pressure for actual scheduling heuristics.
@@ -145,6 +154,8 @@ class GCNSchedStrategy : public GenericScheduler {
bool hasNextStage() const;
+ bool useGCNTrackers() const { return UseGCNTrackers; }
+
GCNSchedStageID getNextStage() const;
GCNDownwardRPTracker *getDownwardTracker() { return &DownwardTracker; }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 8c98e8b589b13..f01b4983e9a6a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -52,6 +52,20 @@ static cl::opt<unsigned>
cl::desc("Number of addresses from which to enable MIMG NSA."),
cl::init(2), cl::Hidden);
+static StringRef getSchedStrategyForPolicy(const Function &F) {
+ Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
+ if (SchedStrategyAttr.isValid())
+ return SchedStrategyAttr.getValueAsString();
+
+ const Module *M = F.getParent();
+ auto *WorkloadType = dyn_cast_or_null<MDString>(
+ M ? M->getModuleFlag("amdgpu-workload-type") : nullptr);
+ if (WorkloadType && WorkloadType->getString().equals_insensitive("ml"))
+ return "coexec";
+
+ return "";
+}
+
GCNSubtarget::~GCNSubtarget() = default;
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
@@ -329,6 +343,14 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// SIRegisterInfo::getRegPressureSetLimit()
Policy.ShouldTrackPressure = true;
+ const Function &F = Region.RegionBegin->getMF()->getFunction();
+ // Always schedule top-down for better balancing of HW resource usage.
+ if (getSchedStrategyForPolicy(F) == "coexec") {
+ Policy.OnlyTopDown = true;
+ Policy.OnlyBottomUp = false;
+ return;
+ }
+
// Enabling both top down and bottom up scheduling seems to give us less
// register spills than just using one of these approaches on its own.
Policy.OnlyTopDown = false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
new file mode 100644
index 0000000000000..e0e41b9486307
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
@@ -0,0 +1,114 @@
+# REQUIRES: asserts
+#
+# Test that the module-level amdgpu-workload-type flag selects the AMDGPU
+# scheduler policy defaults. The ML workload type enables coexec preRA
+# scheduling and nop postRA scheduling. Unknown values should fall back to the
+# default schedulers.
+#
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %t/ml-pre.mir 2>&1 | FileCheck --check-prefix=ML-PRE %s
+# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -misched-prera-direction=bottomup -o - %t/ml-pre.mir 2>&1 | FileCheck --check-prefix=ML-ASSERT %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %t/typo-pre.mir 2>&1 | FileCheck --check-prefix=TYPO-PRE %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=postmisched -verify-misched -debug-only=machine-scheduler -o - %t/ml-post.mir 2>&1 | FileCheck --check-prefix=ML-POST %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=postmisched -verify-misched -debug-only=machine-scheduler -o - %t/typo-post.mir 2>&1 | FileCheck --check-prefix=TYPO-POST %s
+
+#--- ml-pre.mir
+--- |
+ ; ML-PRE: AMDGPU coexec preRA scheduler selected for ml_pre_sched
+ ; ML-ASSERT: coexec scheduler only supports top-down scheduling
+ define amdgpu_kernel void @ml_pre_sched() {
+ ret void
+ }
+
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"amdgpu-workload-type", !"ml"}
+...
+
+---
+name: ml_pre_sched
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:vreg_512_align2 = IMPLICIT_DEF
+ %1:vreg_512_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:vgpr_32_lo256 = IMPLICIT_DEF
+ %4:vgpr_32_lo256 = IMPLICIT_DEF
+ %5:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, implicit %5
+...
+
+#--- typo-pre.mir
+--- |
+ ; TYPO-PRE-NOT: AMDGPU coexec preRA scheduler selected
+ ; TYPO-PRE: name: typo_pre_sched
+ define amdgpu_kernel void @typo_pre_sched() {
+ ret void
+ }
+
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"amdgpu-workload-type", !"m1"}
+...
+
+---
+name: typo_pre_sched
+tracksRegLiveness: true
+body: |
+ bb.0:
+ %0:vreg_512_align2 = IMPLICIT_DEF
+ %1:vreg_512_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:vgpr_32_lo256 = IMPLICIT_DEF
+ %4:vgpr_32_lo256 = IMPLICIT_DEF
+ %5:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, implicit %5
+...
+
+#--- ml-post.mir
+--- |
+ ; ML-POST: AMDGPU coexec postRA scheduler selected (nop) for ml_post_sched
+ define amdgpu_kernel void @ml_post_sched() {
+ ret void
+ }
+
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"amdgpu-workload-type", !"ml"}
+...
+
+---
+name: ml_post_sched
+tracksRegLiveness: true
+body: |
+ bb.0:
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = IMPLICIT_DEF
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr0, killed $vgpr1, implicit $exec
+ renamable $vgpr3 = nsw V_MUL_LO_U32_e64 $vgpr2, $vgpr2, implicit $exec
+ renamable $vgpr4 = nsw V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
+ S_ENDPGM 0, implicit $vgpr4
+...
+
+#--- typo-post.mir
+--- |
+ ; TYPO-POST-NOT: AMDGPU coexec postRA scheduler selected (nop)
+ ; TYPO-POST: name: typo_post_sched
+ define amdgpu_kernel void @typo_post_sched() {
+ ret void
+ }
+
+ !llvm.module.flags = !{!0}
+ !0 = !{i32 1, !"amdgpu-workload-type", !"machine-learning"}
+...
+
+---
+name: typo_post_sched
+tracksRegLiveness: true
+body: |
+ bb.0:
+ renamable $vgpr0 = IMPLICIT_DEF
+ renamable $vgpr1 = IMPLICIT_DEF
+ renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr0, killed $vgpr1, implicit $exec
+ renamable $vgpr3 = nsw V_MUL_LO_U32_e64 $vgpr2, $vgpr2, implicit $exec
+ renamable $vgpr4 = nsw V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
+ S_ENDPGM 0, implicit $vgpr4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
new file mode 100644
index 0000000000000..bac94bdffd375
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coexec-sched-effective-stall.mir
@@ -0,0 +1,124 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched %s -o - | FileCheck -check-prefix=DEFAULT %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -amdgpu-sched-strategy=coexec -verify-misched %s -o - | FileCheck -check-prefix=COEXEC %s
+
+--- |
+ ; Pre-commit test for stall heuristic
+
+ define void @test-sched-effective-stall() #0 { ret void }
+ define void @test-sched-pending-structural-stall() #0 { ret void }
+
+ attributes #0 = { "amdgpu-waves-per-eu"="1,1" }
+...
+
+name: test-sched-effective-stall
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; DEFAULT-LABEL: name: test-sched-effective-stall
+ ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF5]], 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF10:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; DEFAULT-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF6]], [[DEF7]], 0, [[DEF8]], [[DEF9]], [[DEF10]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
+ ;
+ ; COEXEC-LABEL: name: test-sched-effective-stall
+ ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF10:%[0-9]+]]:vreg_64_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 [[DEF10]], 0, 0, implicit $exec
+ ; COEXEC-NEXT: early-clobber %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: early-clobber %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[GLOBAL_LOAD_DWORDX2_]], 8, [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; COEXEC-NEXT: S_ENDPGM 0, implicit [[V_PK_ADD_F32_]], implicit %13, implicit %14
+ %0:vreg_512_align2 = IMPLICIT_DEF
+ %1:vreg_512_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:vgpr_32_lo256 = IMPLICIT_DEF
+ %4:vgpr_32_lo256 = IMPLICIT_DEF
+ %5:vreg_512_align2 = IMPLICIT_DEF
+ %6:vreg_512_align2 = IMPLICIT_DEF
+ %7:vreg_256_align2 = IMPLICIT_DEF
+ %8:vgpr_32_lo256 = IMPLICIT_DEF
+ %9:vgpr_32_lo256 = IMPLICIT_DEF
+ %10:vreg_64_align2 = IMPLICIT_DEF
+ %11:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %10, 0, 0, implicit $exec
+ %12:vreg_64_align2 = V_PK_ADD_F32 8, %11, 8, %11, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %13:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ %14:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0, implicit %12, implicit %13, implicit %14
+...
+
+---
+name: test-sched-pending-structural-stall
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; DEFAULT-LABEL: name: test-sched-pending-structural-stall
+ ; DEFAULT: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; DEFAULT-NEXT: S_NOP 0
+ ; DEFAULT-NEXT: S_NOP 0
+ ; DEFAULT-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; DEFAULT-NEXT: S_ENDPGM 0, implicit %10, implicit %11
+ ;
+ ; COEXEC-LABEL: name: test-sched-pending-structural-stall
+ ; COEXEC: [[DEF:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF1:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF2:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF3:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF4:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF5:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF6:%[0-9]+]]:vreg_512_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF7:%[0-9]+]]:vreg_256_align2 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF8:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: [[DEF9:%[0-9]+]]:vgpr_32_lo256 = IMPLICIT_DEF
+ ; COEXEC-NEXT: early-clobber %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF]], [[DEF1]], 0, [[DEF2]], [[DEF3]], [[DEF4]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: early-clobber %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr [[DEF5]], [[DEF6]], 0, [[DEF7]], [[DEF8]], [[DEF9]], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; COEXEC-NEXT: S_NOP 0
+ ; COEXEC-NEXT: S_NOP 0
+ ; COEXEC-NEXT: S_ENDPGM 0, implicit %10, implicit %11
+ %0:vreg_512_align2 = IMPLICIT_DEF
+ %1:vreg_512_align2 = IMPLICIT_DEF
+ %2:vreg_256_align2 = IMPLICIT_DEF
+ %3:vgpr_32_lo256 = IMPLICIT_DEF
+ %4:vgpr_32_lo256 = IMPLICIT_DEF
+ %5:vreg_512_align2 = IMPLICIT_DEF
+ %6:vreg_512_align2 = IMPLICIT_DEF
+ %7:vreg_256_align2 = IMPLICIT_DEF
+ %8:vgpr_32_lo256 = IMPLICIT_DEF
+ %9:vgpr_32_lo256 = IMPLICIT_DEF
+ %10:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ %11:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %5, %6, 0, %7, %8, %9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ S_NOP 0
+ S_NOP 0
+ S_ENDPGM 0, implicit %10, implicit %11
+...
>From ecdf20bbfa8909cd476cb9b7652bda94e3a9167f Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 8 Mar 2026 10:40:17 -0700
Subject: [PATCH 2/3] Remove module "workload-type" metadata.
---
.../AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 14 ++-
.../Target/AMDGPU/AMDGPUCoExecSchedStrategy.h | 5 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 55 ++-------
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 4 +
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 17 +--
.../amdgpu-workload-type-scheduler-debug.mir | 114 ------------------
6 files changed, 30 insertions(+), 179 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index a1a8a80c5bfb1..9dfb4ab3b282e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -50,10 +50,18 @@ AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
UseGCNTrackers = true;
}
-void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
+void AMDGPUCoExecSchedStrategy::initPolicy(
+ MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
assert((PreRADirection == MISched::Unspecified ||
PreRADirection == MISched::TopDown) &&
"coexec scheduler only supports top-down scheduling");
+ RegionPolicy.OnlyTopDown = true;
+ RegionPolicy.OnlyBottomUp = false;
+}
+
+void AMDGPUCoExecSchedStrategy::initialize(ScheduleDAGMI *DAG) {
// Coexecution scheduling strategy is only done top-down to support new
// resource balancing heuristics.
RegionPolicy.OnlyTopDown = true;
@@ -268,8 +276,8 @@ llvm::createGCNCoExecMachineScheduler(MachineSchedContext *C) {
}
ScheduleDAGInstrs *
-llvm::createGCNCoExecPostMachineScheduler(MachineSchedContext *C) {
- LLVM_DEBUG(dbgs() << "AMDGPU coexec postRA scheduler selected (nop) for "
+llvm::createGCNNoopPostMachineScheduler(MachineSchedContext *C) {
+ LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
<< C->MF->getName() << '\n');
return new GCNNoopPostScheduleDAG(C);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
index 0f7289992ec72..2b661f03aa50a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.h
@@ -31,12 +31,15 @@ class AMDGPUCoExecSchedStrategy final : public GCNSchedStrategy {
public:
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C);
+ void initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) override;
void initialize(ScheduleDAGMI *DAG) override;
SUnit *pickNode(bool &IsTopNode) override;
};
ScheduleDAGInstrs *createGCNCoExecMachineScheduler(MachineSchedContext *C);
-ScheduleDAGInstrs *createGCNCoExecPostMachineScheduler(MachineSchedContext *C);
+ScheduleDAGInstrs *createGCNNoopPostMachineScheduler(MachineSchedContext *C);
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index cffebc602b2bd..87e38580c7b1c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -575,24 +575,10 @@ static cl::opt<std::string>
cl::desc("Select custom AMDGPU scheduling strategy."),
cl::Hidden, cl::init(""));
-enum class AMDGPUPostSchedStrategy {
- Default,
- Nop,
-};
-
-static StringRef getAMDGPUWorkloadType(const Module *M) {
- if (!M)
- return "";
-
- auto *WorkloadType =
- dyn_cast_or_null<MDString>(M->getModuleFlag("amdgpu-workload-type"));
- if (!WorkloadType)
- return "";
-
- return WorkloadType->getString();
-}
-
-static StringRef getAMDGPUSchedStrategy(const Function &F) {
+// Scheduler selection is consulted both when creating the scheduler and from
+// overrideSchedPolicy(), so keep the attribute and global command line handling
+// in one helper.
+StringRef llvm::AMDGPU::getSchedStrategy(const Function &F) {
Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
if (SchedStrategyAttr.isValid())
return SchedStrategyAttr.getValueAsString();
@@ -600,31 +586,14 @@ static StringRef getAMDGPUSchedStrategy(const Function &F) {
if (!AMDGPUSchedStrategy.empty())
return AMDGPUSchedStrategy;
- StringRef WorkloadType = getAMDGPUWorkloadType(F.getParent());
- // ML workloads use coexec scheduling defaults.
- if (WorkloadType.equals_insensitive("ml"))
- return "coexec";
-
return "";
}
-static AMDGPUPostSchedStrategy getAMDGPUPostSchedStrategy(const Function &F) {
+static bool useNoopPostScheduler(const Function &F) {
Attribute PostSchedStrategyAttr =
F.getFnAttribute("amdgpu-post-sched-strategy");
- if (PostSchedStrategyAttr.isValid()) {
- StringRef PostSchedStrategy = PostSchedStrategyAttr.getValueAsString();
- if (PostSchedStrategy == "nop")
- return AMDGPUPostSchedStrategy::Nop;
- // Allow explicit override to keep post-RA scheduling enabled even when
- // preRA resolves to coexec via module defaults.
- if (PostSchedStrategy == "default")
- return AMDGPUPostSchedStrategy::Default;
- }
-
- if (getAMDGPUSchedStrategy(F) == "coexec")
- return AMDGPUPostSchedStrategy::Nop;
-
- return AMDGPUPostSchedStrategy::Default;
+ return PostSchedStrategyAttr.isValid() &&
+ PostSchedStrategyAttr.getValueAsString() == "nop";
}
static cl::opt<bool> EnableRewritePartialRegUses(
@@ -1294,7 +1263,7 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
- StringRef SchedStrategy = getAMDGPUSchedStrategy(C->MF->getFunction());
+ StringRef SchedStrategy = AMDGPU::getSchedStrategy(C->MF->getFunction());
if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
@@ -1319,12 +1288,8 @@ GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
- // Post-RA scheduler selection can be controlled per-function
- // ("amdgpu-post-sched-strategy"), and coexec disables post-RA scheduling
- // based on function/module scheduler policy.
- if (getAMDGPUPostSchedStrategy(C->MF->getFunction()) ==
- AMDGPUPostSchedStrategy::Nop)
- return createGCNCoExecPostMachineScheduler(C);
+ if (useNoopPostScheduler(C->MF->getFunction()))
+ return createGCNNoopPostMachineScheduler(C);
ScheduleDAGMI *DAG =
new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 362899bd260eb..39b80464a9ab8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -27,6 +27,10 @@ namespace llvm {
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
+namespace AMDGPU {
+StringRef getSchedStrategy(const Function &F);
+}
+
class AMDGPUTargetMachine : public CodeGenTargetMachineImpl {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index f01b4983e9a6a..55bbbfcfc86d0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -52,20 +52,6 @@ static cl::opt<unsigned>
cl::desc("Number of addresses from which to enable MIMG NSA."),
cl::init(2), cl::Hidden);
-static StringRef getSchedStrategyForPolicy(const Function &F) {
- Attribute SchedStrategyAttr = F.getFnAttribute("amdgpu-sched-strategy");
- if (SchedStrategyAttr.isValid())
- return SchedStrategyAttr.getValueAsString();
-
- const Module *M = F.getParent();
- auto *WorkloadType = dyn_cast_or_null<MDString>(
- M ? M->getModuleFlag("amdgpu-workload-type") : nullptr);
- if (WorkloadType && WorkloadType->getString().equals_insensitive("ml"))
- return "coexec";
-
- return "";
-}
-
GCNSubtarget::~GCNSubtarget() = default;
GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
@@ -344,8 +330,7 @@ void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
Policy.ShouldTrackPressure = true;
const Function &F = Region.RegionBegin->getMF()->getFunction();
- // Always schedule top-down for better balancing of HW resource usage.
- if (getSchedStrategyForPolicy(F) == "coexec") {
+ if (AMDGPU::getSchedStrategy(F) == "coexec") {
Policy.OnlyTopDown = true;
Policy.OnlyBottomUp = false;
return;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
deleted file mode 100644
index e0e41b9486307..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-workload-type-scheduler-debug.mir
+++ /dev/null
@@ -1,114 +0,0 @@
-# REQUIRES: asserts
-#
-# Test that the module-level amdgpu-workload-type flag selects the AMDGPU
-# scheduler policy defaults. The ML workload type enables coexec preRA
-# scheduling and nop postRA scheduling. Unknown values should fall back to the
-# default schedulers.
-#
-# RUN: rm -rf %t && split-file %s %t
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %t/ml-pre.mir 2>&1 | FileCheck --check-prefix=ML-PRE %s
-# RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -misched-prera-direction=bottomup -o - %t/ml-pre.mir 2>&1 | FileCheck --check-prefix=ML-ASSERT %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=machine-scheduler -verify-misched -debug-only=machine-scheduler -o - %t/typo-pre.mir 2>&1 | FileCheck --check-prefix=TYPO-PRE %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=postmisched -verify-misched -debug-only=machine-scheduler -o - %t/ml-post.mir 2>&1 | FileCheck --check-prefix=ML-POST %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=postmisched -verify-misched -debug-only=machine-scheduler -o - %t/typo-post.mir 2>&1 | FileCheck --check-prefix=TYPO-POST %s
-
-#--- ml-pre.mir
---- |
- ; ML-PRE: AMDGPU coexec preRA scheduler selected for ml_pre_sched
- ; ML-ASSERT: coexec scheduler only supports top-down scheduling
- define amdgpu_kernel void @ml_pre_sched() {
- ret void
- }
-
- !llvm.module.flags = !{!0}
- !0 = !{i32 1, !"amdgpu-workload-type", !"ml"}
-...
-
----
-name: ml_pre_sched
-tracksRegLiveness: true
-body: |
- bb.0:
- %0:vreg_512_align2 = IMPLICIT_DEF
- %1:vreg_512_align2 = IMPLICIT_DEF
- %2:vreg_256_align2 = IMPLICIT_DEF
- %3:vgpr_32_lo256 = IMPLICIT_DEF
- %4:vgpr_32_lo256 = IMPLICIT_DEF
- %5:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- S_ENDPGM 0, implicit %5
-...
-
-#--- typo-pre.mir
---- |
- ; TYPO-PRE-NOT: AMDGPU coexec preRA scheduler selected
- ; TYPO-PRE: name: typo_pre_sched
- define amdgpu_kernel void @typo_pre_sched() {
- ret void
- }
-
- !llvm.module.flags = !{!0}
- !0 = !{i32 1, !"amdgpu-workload-type", !"m1"}
-...
-
----
-name: typo_pre_sched
-tracksRegLiveness: true
-body: |
- bb.0:
- %0:vreg_512_align2 = IMPLICIT_DEF
- %1:vreg_512_align2 = IMPLICIT_DEF
- %2:vreg_256_align2 = IMPLICIT_DEF
- %3:vgpr_32_lo256 = IMPLICIT_DEF
- %4:vgpr_32_lo256 = IMPLICIT_DEF
- %5:vreg_256_align2 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_threeaddr %0, %1, 0, %2, %3, %4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
- S_ENDPGM 0, implicit %5
-...
-
-#--- ml-post.mir
---- |
- ; ML-POST: AMDGPU coexec postRA scheduler selected (nop) for ml_post_sched
- define amdgpu_kernel void @ml_post_sched() {
- ret void
- }
-
- !llvm.module.flags = !{!0}
- !0 = !{i32 1, !"amdgpu-workload-type", !"ml"}
-...
-
----
-name: ml_post_sched
-tracksRegLiveness: true
-body: |
- bb.0:
- renamable $vgpr0 = IMPLICIT_DEF
- renamable $vgpr1 = IMPLICIT_DEF
- renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr0, killed $vgpr1, implicit $exec
- renamable $vgpr3 = nsw V_MUL_LO_U32_e64 $vgpr2, $vgpr2, implicit $exec
- renamable $vgpr4 = nsw V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
- S_ENDPGM 0, implicit $vgpr4
-...
-
-#--- typo-post.mir
---- |
- ; TYPO-POST-NOT: AMDGPU coexec postRA scheduler selected (nop)
- ; TYPO-POST: name: typo_post_sched
- define amdgpu_kernel void @typo_post_sched() {
- ret void
- }
-
- !llvm.module.flags = !{!0}
- !0 = !{i32 1, !"amdgpu-workload-type", !"machine-learning"}
-...
-
----
-name: typo_post_sched
-tracksRegLiveness: true
-body: |
- bb.0:
- renamable $vgpr0 = IMPLICIT_DEF
- renamable $vgpr1 = IMPLICIT_DEF
- renamable $vgpr2 = nsw V_MUL_LO_U32_e64 killed $vgpr0, killed $vgpr1, implicit $exec
- renamable $vgpr3 = nsw V_MUL_LO_U32_e64 $vgpr2, $vgpr2, implicit $exec
- renamable $vgpr4 = nsw V_ADD_U32_e32 killed $vgpr2, killed $vgpr3, implicit $exec
- S_ENDPGM 0, implicit $vgpr4
-...
>From e3b1ae624bae8d43b95b7ff373f798eaec2be347 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Sun, 8 Mar 2026 20:34:26 -0700
Subject: [PATCH 3/3] Formatting.
---
llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
index 9dfb4ab3b282e..9d50c4e047943 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCoExecSchedStrategy.cpp
@@ -50,9 +50,9 @@ AMDGPUCoExecSchedStrategy::AMDGPUCoExecSchedStrategy(
UseGCNTrackers = true;
}
-void AMDGPUCoExecSchedStrategy::initPolicy(
- MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
- unsigned NumRegionInstrs) {
+void AMDGPUCoExecSchedStrategy::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
GCNSchedStrategy::initPolicy(Begin, End, NumRegionInstrs);
assert((PreRADirection == MISched::Unspecified ||
PreRADirection == MISched::TopDown) &&
More information about the llvm-commits
mailing list