[llvm-branch-commits] [llvm] [AMDGPU] Add HWUI pressure heuristics to coexec strategy (PR #184929)

Mon Mar 9 22:52:30 PDT 2026

================
@@ -19,12 +19,293 @@
 
 namespace llvm {
 
+//===----------------------------------------------------------------------===//
+// Instruction Flavor Classification
+//===----------------------------------------------------------------------===//
+
+/// Coarse classification of an instruction by the kind of work it performs.
+/// The enumerators are contiguous and start at zero; NUM_FLAVORS must stay
+/// last because it is used as the count of real flavors (e.g. as a loop bound
+/// and as the upper limit when converting from an integer).
+enum class InstructionFlavor : uint8_t {
+  WMMA,            // WMMA/MFMA matrix operations
+  SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
+  TRANS,           // Transcendental ops (v_exp, v_log, etc.)
+  MultiCycleVALU,  // VALU instructions with repeat rate > 1
+  VMEM,            // FLAT/GLOBAL memory operations
+  DS,              // LDS/GDS operations
+  SALU,            // Scalar ALU
+  DMA,             // Tensor DMA operations
+  Fence,           // Fences and waits
+  Other,           // Everything else
+  NUM_FLAVORS      // Number of flavors above; not a flavor itself
+};
+
+/// \returns the human-readable name of flavor \p F. The NUM_FLAVORS sentinel
+/// is reported as "???".
+inline StringRef getFlavorName(InstructionFlavor F) {
+  // One entry per enumerator, in declaration order; the final entry is the
+  // placeholder for the NUM_FLAVORS sentinel.
+  static constexpr const char *Names[] = {
+      "WMMA", "VALU(1c)", "TRANS", "VALU(Nc)", "VMEM", "DS",
+      "SALU", "DMA",      "Fence", "Other",    "???"};
+  constexpr unsigned NumNames = sizeof(Names) / sizeof(Names[0]);
+  static_assert(NumNames ==
+                    static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS) + 1,
+                "Names must cover every InstructionFlavor enumerator");
+
+  const unsigned Slot = static_cast<unsigned>(F);
+  if (Slot >= NumNames)
+    llvm_unreachable("Unknown InstructionFlavor");
+  return Names[Slot];
+}
+
+/// \returns a one-character abbreviation for flavor \p F. The NUM_FLAVORS
+/// sentinel is reported as "?".
+inline StringRef getFlavorShortName(InstructionFlavor F) {
+  // Stays null only if F holds a value that is not a declared enumerator.
+  const char *Abbrev = nullptr;
+  switch (F) {
+  case InstructionFlavor::WMMA:
+    Abbrev = "W";
+    break;
+  case InstructionFlavor::SingleCycleVALU:
+    Abbrev = "V";
+    break;
+  case InstructionFlavor::TRANS:
+    Abbrev = "T";
+    break;
+  case InstructionFlavor::MultiCycleVALU:
+    Abbrev = "C";
+    break;
+  case InstructionFlavor::VMEM:
+    Abbrev = "M";
+    break;
+  case InstructionFlavor::DS:
+    Abbrev = "D";
+    break;
+  case InstructionFlavor::SALU:
+    Abbrev = "S";
+    break;
+  case InstructionFlavor::DMA:
+    Abbrev = "X";
+    break;
+  case InstructionFlavor::Fence:
+    Abbrev = "F";
+    break;
+  case InstructionFlavor::Other:
+    Abbrev = "O";
+    break;
+  case InstructionFlavor::NUM_FLAVORS:
+    Abbrev = "?";
+    break;
+  }
+  if (!Abbrev)
+    llvm_unreachable("Unknown InstructionFlavor");
+  return Abbrev;
+}
+
+/// Maps the instruction \p MI to an InstructionFlavor. Only the declaration
+/// appears here; the classification rules live with the definition.
+/// NOTE(review): \p SII is presumably used to query instruction properties --
+/// confirm against the definition.
+InstructionFlavor classifyFlavor(const MachineInstr *MI,
+                                 const SIInstrInfo *SII);
+
+using FlavorGroup = SmallVector<InstructionFlavor, 4>;
+
+namespace FlavorGroups {
+/// \returns a group containing only \p F.
+inline FlavorGroup individual(InstructionFlavor F) { return {F}; }
+
+/// \returns the three VALU flavors: single-cycle, transcendental and
+/// multi-cycle.
+inline FlavorGroup allVALU() {
+  FlavorGroup VALUFlavors = {InstructionFlavor::SingleCycleVALU,
+                             InstructionFlavor::TRANS,
+                             InstructionFlavor::MultiCycleVALU};
+  return VALUFlavors;
+}
+
+/// \returns the memory flavors: VMEM, DS and DMA.
+inline FlavorGroup allMem() {
+  FlavorGroup MemFlavors = {InstructionFlavor::VMEM, InstructionFlavor::DS,
+                            InstructionFlavor::DMA};
+  return MemFlavors;
+}
+
+/// \returns every flavor below NUM_FLAVORS, in declaration order.
+inline FlavorGroup all() {
+  constexpr unsigned NumFlavors =
+      static_cast<unsigned>(InstructionFlavor::NUM_FLAVORS);
+  FlavorGroup Everything;
+  for (unsigned Raw = 0; Raw != NumFlavors; ++Raw)
+    Everything.push_back(static_cast<InstructionFlavor>(Raw));
+  return Everything;
+}
+} // namespace FlavorGroups
+
+/// AMDGPU-specific scheduling decision reasons. These provide more granularity
+/// than the generic CandReason enum for debugging purposes.
+// NUM_REASONS must stay last: it counts the enumerators above it and is not
+// itself a scheduling reason.
+enum class AMDGPUSchedReason : uint8_t {
+  None,
+  CritResourceBalance, // tryCriticalResource chose based on resource pressure
+  CritResourceDep,     // tryCriticalResourceDependency chose based on enabling
+  NUM_REASONS          // Number of reasons above; not a reason itself
+};
+
+/// \returns the printable name of scheduling reason \p R. The NUM_REASONS
+/// sentinel is reported as "???".
+inline StringRef getReasonName(AMDGPUSchedReason R) {
+  if (R == AMDGPUSchedReason::None)
+    return "None";
+  if (R == AMDGPUSchedReason::CritResourceBalance)
+    return "CritResource";
+  if (R == AMDGPUSchedReason::CritResourceDep)
+    return "CritResourceDep";
+  if (R == AMDGPUSchedReason::NUM_REASONS)
+    return "???";
+  // Only reachable when R holds a value that is not a declared enumerator.
+  llvm_unreachable("Unknown AMDGPUSchedReason");
+}
+
+//===----------------------------------------------------------------------===//
+// Hardware Unit Information
+//===----------------------------------------------------------------------===//
+
+/// HardwareUnitInfo is a wrapper class which maps to some real hardware
+/// resource. This is used to model hardware resource pressure per region, and
+/// guide scheduling heuristics.
+class HardwareUnitInfo {
+private:
+  /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
+  /// for this HardwareUnit. This is used for agreement between
+  /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
+  /// dependencies for a SU on critical resource, then schedule that same SU on
+  /// the critical resource. This agreement results in shorter live ranges and
+  /// more regular HardwareUnit access patterns. SUs are prioritized based on
+  /// depth for top-down scheduling.
+  SmallSetVector<SUnit *, 16> PrioritySUs;
+  /// All the SUs in the region that consume this resource
+  SmallSetVector<SUnit *, 16> AllSUs;
+  /// The total number of busy cycles for this HardwareUnit for a given region.
+  unsigned TotalCycles = 0;
+  // InstructionFlavor mapping
+  InstructionFlavor Type;
+  // Idx mapping
+  unsigned Idx;
+  // Whether or not instructions on this HardwareUnit may produce a window in
+  // which instructions in other HardwareUnits can coexecute. For example, WMMA
+  // / MFMA instructions may take multiple cycles, which may be overlapped with
+  // instructions on other HardwareUnits
+  bool ProducesCoexecWindow = false;
+
+public:
+  HardwareUnitInfo() {}
+
+  unsigned size() { return AllSUs.size(); }
+
+  unsigned getTotalCycles() { return TotalCycles; }
+
+  void setType(unsigned TheType) {
+    assert(TheType < (unsigned)InstructionFlavor::NUM_FLAVORS);
+    Type = (InstructionFlavor)(TheType);
+  }
+
+  InstructionFlavor getType() const { return Type; }
+
+  unsigned getIdx() const { return Idx; }
+
+  bool producesCoexecWindow() const { return ProducesCoexecWindow; }
+
+  void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
+
+  bool contains(SUnit *SU) { return AllSUs.contains(SU); }
+
+  /// \returns whichever of \p SU and \p Other has higher priority, or nullptr
+  /// if there is no difference in priority between them. This method looks
+  /// through the PrioritySUs to determine if one SU is more prioritized than
+  /// the other; whichever appears first in the list wins. If neither is in the
+  /// PrioritySUs list, then neither has priority over the other.
+  SUnit *getHigherPriority(SUnit *SU, SUnit *Other) {
+    for (auto *SUOrder : PrioritySUs) {
+      if (SUOrder == SU) {
+        return SU;
+      }
+      if (SUOrder == Other) {
+        return Other;
+      }
+    }
+    return nullptr;
+  }
+
+  void reset() {
+    AllSUs.clear();
+    PrioritySUs.clear();
+    TotalCycles = 0;
+    ProducesCoexecWindow = false;
+  }
+
+  /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
+  /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
+  /// ready) to AllSUs to attempt to find a target SU. When looking through
+  /// AllSUs we pick the target SU by minimal depth for top-down scheduling.
+  /// getNextTargetSU is useful for determining which SU on this HardwareUnit we
+  /// are trying to schedule - this info helps us determine which dependencies
+  /// to schedule. LookDeep is useful if the dependencies are long latency (e.g.
+  /// memory instructions). If we have many lkong latency dependencies, it is
----------------
kerbowa wrote:

Nit: lkong->long

https://github.com/llvm/llvm-project/pull/184929