[llvm] [AMDGPU] Add initial cost function framework for balanced scheduling (PR #160558)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 24 09:54:05 PDT 2025
https://github.com/kerbowa created https://github.com/llvm/llvm-project/pull/160558
Introduce an initial cost function into the AMDGPU instruction scheduler
as the foundation for a more balanced scheduling framework. The goal is
to move beyond occupancy-as-a-hard-target by providing a configurable
mechanism to evaluate trade-offs between different candidate schedules.
Key features:
- Schedule length term weighted by block frequency.
- Weighted occupancy cost with a concave penalty, so occupancy gains at low occupancy (e.g. 1 -> 2 waves) count for more than gains at high occupancy (e.g. 8 -> 9).
- Large additive penalty to strongly discourage schedules that increase spilling.
- Configurable weights and knobs to support tuning.
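As a rough illustration of how the terms interact, here is a minimal standalone sketch using the default weights (OccW = LenW = 1.0, SpillW = 100.0, OccExp = 1.0). It mirrors the proposed AMDGPUSchedCostFunction::score; the numbers are illustrative only:

  #include <cmath>
  #include <cstdio>

  // Lower is better; mirrors the proposed score() with its default weights.
  static double score(unsigned Waves, unsigned LengthCycles,
                      unsigned SpillUnits, double BlockFreq = 1.0) {
    double Occ = 1.0 / std::pow(double(Waves), 1.0); // diminishing returns
    double Len = 1.0 * LengthCycles * BlockFreq;     // hot blocks count more
    double Spill = 100.0 * SpillUnits;               // dominates other terms
    return Occ + Len + Spill;
  }

  int main() {
    // One spill unit outweighs an 80-cycle length difference:
    std::printf("%.2f vs %.2f\n", score(4, 120, 1), score(4, 200, 0));
    // prints 220.25 vs 200.25, so the spill-free schedule wins
  }

Note that with the defaults the occupancy term never exceeds 1.0, so meaningful occupancy/latency trade-offs require raising -amdgpu-sched-cost-weight-occupancy.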
From e5dde744a34befe0f227140952958df03bebc5a7 Mon Sep 17 00:00:00 2001
From: Austin Kerbow <Austin.Kerbow at amd.com>
Date: Wed, 24 Sep 2025 09:34:47 -0700
Subject: [PATCH] [AMDGPU] Add initial cost function framework for balanced
scheduling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce an initial cost function into the AMDGPU instruction scheduler
as the foundation for a more balanced scheduling framework. The goal is
to move beyond occupancy-as-a-hard-target by providing a configurable
mechanism to evaluate trade-offs between different candidate schedules.
Key features:
- Schedule length term weighted by block frequency.
- Weighted occupancy cost with a concave penalty, so occupancy gains at low occupancy (e.g. 1 -> 2 waves) count for more than gains at high occupancy (e.g. 8 -> 9).
- Large additive penalty to strongly discourage schedules that increase spilling.
- Configurable weights and knobs to support tuning.
---
llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 226 +++++++++++++++++++-
llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 62 ++++++
2 files changed, 286 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 254b75b784e75..874dfc09ad4e8 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -23,9 +23,9 @@
///
//===----------------------------------------------------------------------===//
-#include "GCNSchedStrategy.h"
#include "AMDGPUIGroupLP.h"
#include "GCNRegPressure.h"
+#include "GCNSchedStrategy.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
@@ -33,6 +33,9 @@
#include "llvm/MC/LaneBitmask.h"
#include "llvm/Support/ErrorHandling.h"
+#include <cmath>
+#include <limits>
+
#define DEBUG_TYPE "machine-scheduler"
using namespace llvm;
@@ -70,6 +73,79 @@ static cl::opt<bool> GCNTrackers(
const unsigned ScheduleMetrics::ScaleFactor = 100;
+//===----------------------------------------------------------------------===//
+// Optional cost-function mode: command line switches
+//===----------------------------------------------------------------------===//
+
+static cl::opt<bool> UseSchedCostFunction(
+ "amdgpu-use-cost-function", cl::Hidden,
+ cl::desc("Enable cost-function-based evaluation to compare candidate"
+ " schedules in GCNSchedStrategy."),
+ cl::init(false));
+
+static cl::opt<double> SchedCostWeightOccupancy(
+ "amdgpu-sched-cost-weight-occupancy", cl::Hidden,
+ cl::desc("Weight for occupancy term in AMDGPU scheduler cost function"),
+ cl::init(1.0));
+static cl::opt<double> SchedCostWeightLength(
+ "amdgpu-sched-cost-weight-length", cl::Hidden,
+ cl::desc("Weight for schedule length term (cycles) in cost function"),
+ cl::init(1.0));
+static cl::opt<double> SchedCostWeightSpill(
+ "amdgpu-sched-cost-weight-spill", cl::Hidden,
+ cl::desc("Weight for spill term; typically much larger than length"),
+ cl::init(100.0));
+
+// Shape the occupancy term: reciprocal exponent and low-occupancy penalty.
+static cl::opt<double> SchedCostOccExponent(
+ "amdgpu-sched-cost-occ-exponent", cl::Hidden,
+ cl::desc("Exponent for occupancy diminishing-returns curve (cost ~ 1/W^exp)"),
+ cl::init(1.0));
+static cl::opt<unsigned> SchedCostLowOccFloor(
+ "amdgpu-sched-cost-lowocc-floor", cl::Hidden,
+ cl::desc("Preferred minimum waves; waves below this get extra penalty"),
+ cl::init(2));
+static cl::opt<double> SchedCostLowOccPenalty(
+ "amdgpu-sched-cost-lowocc-penalty", cl::Hidden,
+ cl::desc("Penalty weight multiplied by (floor - waves) when below floor"),
+ cl::init(0.0));
+
+static cl::opt<bool> UseStageCostDecision(
+ "amdgpu-use-stage-cost-decision", cl::Hidden,
+ cl::desc("Defer cost decisions to end of stage using block-frequency"
+ " weighted totals, instead of per-region immediate reverts"),
+ cl::init(false));
+
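+// Example invocation (flags are hidden; values are illustrative):
+//   llc -mtriple=amdgcn-amd-amdhsa ... -amdgpu-use-cost-function \
+//       -amdgpu-sched-cost-weight-spill=200
+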
+// Helper: concave occupancy utility. Map waves -> diminishing cost reduction.
+static inline double occupancyCost(unsigned Waves, double Exp) {
+ if (Waves == 0)
+ return std::numeric_limits<double>::infinity();
+  // Use the reciprocal to get a simple concave utility: more waves means a
+  // smaller cost, with diminishing returns as Waves grows.
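+  // Example (Exp = 1): 1 wave -> 1.0, 2 -> 0.5, 8 -> 0.125, 9 -> ~0.111,
+  // so the 1 -> 2 step saves 0.5 while 8 -> 9 saves only ~0.014.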
+ return 1.0 / std::pow(static_cast<double>(Waves), Exp);
+}
+
+double AMDGPUSchedCostFunction::score(unsigned Waves, unsigned LengthCycles,
+ unsigned SpillUnits,
+ double BlockFreq) const {
+ // BlockFreq defaults to 1.0 if unknown; scale length proportionally.
+ double Freq = BlockFreq > 0.0 ? BlockFreq : 1.0;
+
+ // Occupancy cost: lower if more waves. Concave via reciprocal.
+ double OccTerm = OccW * occupancyCost(Waves, OccExp);
+ if (LowOccPenalty > 0.0 && Waves < LowOccFloor)
+ OccTerm += LowOccPenalty * static_cast<double>(LowOccFloor - Waves);
+
+ // Length cost: cycles weighted by block frequency.
+ double LenTerm = LenW * (static_cast<double>(LengthCycles) * Freq);
+
+  // Spill cost: heavy penalty. SpillUnits is an estimated count; it may be a
+  // number of excess registers or estimated bytes, so we keep it generic.
+ double SpillTerm = SpillW * static_cast<double>(SpillUnits);
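+  // With the default weights a single spill unit (SpillW = 100) dwarfs the
+  // occupancy term, which never exceeds OccW for Waves >= 1.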
+
+ return OccTerm + LenTerm + SpillTerm;
+}
+
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
: GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
DownwardTracker(*C->LIS), UpwardTracker(*C->LIS), HasHighPressure(false) {
@@ -1032,6 +1108,12 @@ bool GCNSchedStage::initGCNSchedStage() {
return false;
LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
+ if (UseSchedCostFunction && UseStageCostDecision) {
+ StageCostBefore = 0.0;
+ StageCostAfter = 0.0;
+ StageSavedOrder.clear();
+ StageSavedPressure.clear();
+ }
return true;
}
@@ -1136,6 +1218,21 @@ bool PreRARematStage::initGCNSchedStage() {
void GCNSchedStage::finalizeGCNSchedStage() {
DAG.finishBlock();
LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
+
+ if (UseSchedCostFunction && UseStageCostDecision) {
+ if (StageCostAfter > StageCostBefore) {
+ LLVM_DEBUG(dbgs() << "[CostFunction] Reverting entire stage: cost before="
+ << StageCostBefore << ", cost after=" << StageCostAfter
+ << "\n");
+ for (const auto &It : StageSavedOrder) {
+ unsigned R = It.getFirst();
+ const std::vector<MachineInstr *> &Order = It.getSecond();
+ revertRegionToOrder(R, Order);
+ if (auto P = StageSavedPressure.find(R); P != StageSavedPressure.end())
+ DAG.Pressure[R] = P->second;
+ }
+ }
+ }
}
void UnclusteredHighRPStage::finalizeGCNSchedStage() {
@@ -1266,6 +1363,54 @@ void GCNSchedStage::finalizeGCNRegion() {
// reason that the original schedule is better.
checkScheduling();
+ // If deferring cost decision to the end of stage, accumulate per-region
+ // costs now. Functional reverts still happen in checkScheduling().
+ if (UseSchedCostFunction && UseStageCostDecision) {
+ if (!StageSavedOrder.contains(RegionIdx))
+ StageSavedOrder[RegionIdx] = Unsched;
+ if (!StageSavedPressure.contains(RegionIdx))
+ StageSavedPressure[RegionIdx] = PressureBefore;
+
+ ScheduleMetrics MAfter = getScheduleMetrics(DAG);
+ ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
+ unsigned LengthAfter = MAfter.getLength();
+ unsigned LengthBefore = MBefore.getLength();
+
+ auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned {
+ unsigned Spill = 0;
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+      unsigned MaxArchVGPRs =
+          std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts());
+ unsigned AG = P.getAGPRNum();
+ unsigned AV = P.getArchVGPRNum();
+ unsigned SG = P.getSGPRNum();
+      if (VG > MaxVGPRs)
+        Spill += VG - MaxVGPRs;
+      if (AV > MaxArchVGPRs)
+        Spill += AV - MaxArchVGPRs;
+      if (AG > MaxArchVGPRs)
+        Spill += AG - MaxArchVGPRs;
+      if (SG > MaxSGPRs)
+        Spill += SG - MaxSGPRs;
+ return Spill;
+ };
+
+ unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
+ unsigned TargetOcc = std::min(
+ S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
+ unsigned WavesBefore = std::min(
+ TargetOcc, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
+ unsigned WavesAfter = std::min(
+ TargetOcc, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
+ unsigned SpillBefore = EstimateSpill(PressureBefore);
+ unsigned SpillAfter = EstimateSpill(PressureAfter);
+
+ double BlockFreq = 1.0; // TODO: wire MBFI when available
+ AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy,
+ SchedCostWeightLength, SchedCostWeightSpill,
+ SchedCostOccExponent, SchedCostLowOccFloor,
+ SchedCostLowOccPenalty);
+    StageCostBefore +=
+        CF.score(WavesBefore, LengthBefore, SpillBefore, BlockFreq);
+    StageCostAfter +=
+        CF.score(WavesAfter, LengthAfter, SpillAfter, BlockFreq);
+ }
+
if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
StageID != GCNSchedStageID::UnclusteredHighRPReschedule)
SavedMutations.swap(DAG.Mutations);
@@ -1340,7 +1485,70 @@ void GCNSchedStage::checkScheduling() {
// Revert if this region's schedule would cause a drop in occupancy or
// spilling.
- if (shouldRevertScheduling(WavesAfter))
+ bool Revert = shouldRevertScheduling(WavesAfter);
+
+ // Optional: cost-function evaluation. Compare previous vs new schedule
+ // and retain whichever has lower total cost. This only triggers if
+ // enabled by flag; default behavior remains unchanged.
+ if (!Revert && UseSchedCostFunction) {
+    // Compute a simple schedule length estimate using the existing metric
+    // helper. We reuse the bubble/length estimator for the current DAG versus
+    // the original order (DAG.SUnits captures the pre-scheduling order).
+ ScheduleMetrics MAfter = getScheduleMetrics(DAG);
+ ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
+
+ unsigned LengthAfter = MAfter.getLength();
+ unsigned LengthBefore = MBefore.getLength();
+
+    // Estimate spill cost using register pressure in excess of hardware
+    // maxima as a proxy. The RegionsWithExcessRP bit could also serve as a
+    // coarse indicator.
+ auto EstimateSpill = [&](const GCNRegPressure &P) -> unsigned {
+ unsigned Spill = 0;
+ // Excess over addressable limits captures risk of spills.
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+      unsigned MaxArchVGPRs =
+          std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ unsigned VG = P.getVGPRNum(ST.hasGFX90AInsts());
+ unsigned AG = P.getAGPRNum();
+ unsigned AV = P.getArchVGPRNum();
+ unsigned SG = P.getSGPRNum();
+ if (VG > MaxVGPRs)
+ Spill += VG - MaxVGPRs;
+ if (AV > MaxArchVGPRs)
+ Spill += AV - MaxArchVGPRs;
+ if (AG > MaxArchVGPRs)
+ Spill += AG - MaxArchVGPRs;
+ if (SG > MaxSGPRs)
+ Spill += SG - MaxSGPRs;
+ return Spill;
+ };
+
+ unsigned SpillAfter = EstimateSpill(PressureAfter);
+ unsigned SpillBefore = EstimateSpill(PressureBefore);
+
+ // Occupancy for before/after.
+ unsigned WavesBeforeOcc = std::min(
+ TargetOccupancy,
+ PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
+ unsigned WavesAfterOcc = std::min(
+ TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
+
+ // Block frequency weighting: use MBFI if available, else 1.0. The
+ // MachineSchedContext does not expose MBFI here, so default to 1.0.
+ double BlockFreq = 1.0;
+
+ AMDGPUSchedCostFunction CF(SchedCostWeightOccupancy,
+ SchedCostWeightLength, SchedCostWeightSpill,
+ SchedCostOccExponent, SchedCostLowOccFloor,
+ SchedCostLowOccPenalty);
+    double CostBefore =
+        CF.score(WavesBeforeOcc, LengthBefore, SpillBefore, BlockFreq);
+    double CostAfter =
+        CF.score(WavesAfterOcc, LengthAfter, SpillAfter, BlockFreq);
+
+ if (CostAfter > CostBefore)
+ Revert = true;
+ }
+
+ if (Revert)
revertScheduling();
else
DAG.Pressure[RegionIdx] = PressureAfter;
@@ -1633,6 +1841,20 @@ void GCNSchedStage::revertScheduling() {
DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
}
+void GCNSchedStage::revertRegionToOrder(
+ unsigned RIdx, const std::vector<MachineInstr *> &SavedOrder) {
+ auto &Bounds = DAG.Regions[RIdx];
+ RegionIdx = RIdx;
+ DAG.RegionBegin = Bounds.first;
+ DAG.RegionEnd = Bounds.second;
+
+ std::vector<MachineInstr *> OrigUnsched;
+ OrigUnsched.swap(Unsched);
+ Unsched = SavedOrder;
+ revertScheduling();
+ Unsched.swap(OrigUnsched);
+}
+
bool PreRARematStage::allUsesAvailableAt(const MachineInstr *InstToRemat,
SlotIndex OriginalIdx,
SlotIndex RematIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 790370ff8ab4d..0e7b6d02a5889 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -133,6 +133,58 @@ class GCNSchedStrategy : public GenericScheduler {
GCNUpwardRPTracker *getUpwardTracker() { return &UpwardTracker; }
};
+/// Cost-function based schedule evaluation (optional, off by default).
+///
+/// Purpose
+/// - Provide a scalar score to compare candidate schedules using
+/// tunable weights rather than purely greedy heuristics.
+///
+/// Inputs
+/// - Occupancy in waves (unsigned)
+/// - Estimated schedule length in cycles (unsigned)
+/// - Estimated spill cost (unsigned units; 0 when no spill is expected)
+/// - Block execution frequency (double; typically normalized, defaults to 1)
+///
+/// Behavior
+/// - Occupancy has diminishing returns: the cost contribution decreases
+/// concavely with more waves, so increasing 1→2 waves is more valuable
+/// than 8→9.
+/// - Spills dominate the cost: each unit of spill is penalized very heavily
+/// relative to a single cycle of schedule length.
+/// - Schedule length is weighted by basic block execution frequency, so hot
+/// blocks count more.
+///
+/// Configuration
+/// - Weights are controllable via command-line flags:
+/// - `-amdgpu-sched-cost-weight-occupancy`
+/// - `-amdgpu-sched-cost-weight-length`
+/// - `-amdgpu-sched-cost-weight-spill`
+/// - Lower scores are better.
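+///
+/// Worked example (default weights; numbers illustrative)
+/// - score(Waves=4, Length=120, Spill=1) = 0.25 + 120 + 100 = 220.25
+/// - score(Waves=4, Length=200, Spill=0) = 0.25 + 200 + 0 = 200.25
+/// so the spill-free schedule wins even though it is 80 cycles longer.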
+class AMDGPUSchedCostFunction {
+ double OccW = 1.0;
+ double LenW = 1.0;
+ double SpillW = 100.0;
+ // Shape of diminishing returns for occupancy: cost ~ 1 / Waves^OccExp.
+ double OccExp = 1.0;
+ // Extra penalty applied when Waves < LowOccFloor to emphasize that
+ // very low occupancy is particularly harmful (linear deficit penalty).
+ unsigned LowOccFloor = 2; // e.g. heavily prefer 2+ waves over 1
+ double LowOccPenalty = 0.0;
+
+public:
+ AMDGPUSchedCostFunction() = default;
+ AMDGPUSchedCostFunction(double OccWeight, double LenWeight, double SpillWeight,
+ double OccExponent, unsigned LowOccPrefFloor,
+ double LowOccPenaltyWeight)
+ : OccW(OccWeight), LenW(LenWeight), SpillW(SpillWeight),
+ OccExp(OccExponent), LowOccFloor(LowOccPrefFloor),
+ LowOccPenalty(LowOccPenaltyWeight) {}
+
+ /// Compute the total schedule score. Lower is better.
+ double score(unsigned Waves, unsigned LengthCycles, unsigned SpillUnits,
+ double BlockFreq) const;
+};
+
/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
/// maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
@@ -338,6 +390,12 @@ class GCNSchedStage {
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+ // Stage-level cost aggregation and saved state for potential rollback.
+ double StageCostBefore = 0.0;
+ double StageCostAfter = 0.0;
+ DenseMap<unsigned, std::vector<MachineInstr *>> StageSavedOrder;
+ DenseMap<unsigned, GCNRegPressure> StageSavedPressure;
+
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG);
public:
@@ -385,6 +443,10 @@ class GCNSchedStage {
// Attempt to revert scheduling for this region.
void revertScheduling();
+ // Revert region at index RegionIdx to a previously saved instruction order.
+ void revertRegionToOrder(unsigned RegionIdx,
+ const std::vector<MachineInstr *> &SavedOrder);
+
void advanceRegion() { RegionIdx++; }
virtual ~GCNSchedStage() = default;