[llvm-branch-commits] [llvm] [AMDGPU] Add HWUI pressure heuristics to coexec strategy (PR #184929)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Mar 6 05:25:42 PST 2026


================
@@ -41,6 +41,370 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
   return OnlyChoice;
 }
 
+/// Classify \p MI into the InstructionFlavor bucket used by the HWUI
+/// heuristics.
+///
+/// A null or debug instruction is InstructionFlavor::Other. A fixed set of
+/// opcodes is classified by identity (Fence, DMA); every other instruction is
+/// classified by the first matching \p SII format query, falling back to
+/// InstructionFlavor::Other when none matches.
+InstructionFlavor llvm::classifyFlavor(const MachineInstr *MI,
+                                       const SIInstrInfo *SII) {
+  if (!MI || MI->isDebugInstr())
+    return InstructionFlavor::Other;
+
+  // Opcodes that are recognized by identity take precedence over the
+  // format-based queries below.
+  switch (MI->getOpcode()) {
+  case AMDGPU::ATOMIC_FENCE:
+  case AMDGPU::S_WAIT_ASYNCCNT:
+  case AMDGPU::S_WAIT_TENSORCNT:
+  case AMDGPU::S_BARRIER_WAIT:
+  case AMDGPU::S_BARRIER_SIGNAL_IMM:
+    return InstructionFlavor::Fence;
+  case AMDGPU::TENSOR_LOAD_TO_LDS_D2:
+  case AMDGPU::TENSOR_LOAD_TO_LDS:
+  case AMDGPU::GLOBAL_LOAD_ASYNC_TO_LDS_B32:
+  case AMDGPU::GLOBAL_LOAD_ASYNC_TO_LDS_B32_SADDR:
+    return InstructionFlavor::DMA;
+  default:
+    break;
+  }
+
+  // The order of these checks is significant: the first query that matches
+  // decides the flavor (presumably because the categories can overlap, e.g.
+  // TRANS vs. VALU -- confirm against SIInstrInfo).
+  const MachineInstr &Inst = *MI;
+  if (SII->isMFMAorWMMA(Inst))
+    return InstructionFlavor::WMMA;
+  if (SII->isTRANS(Inst))
+    return InstructionFlavor::TRANS;
+  if (SII->isVALU(Inst))
+    return InstructionFlavor::SingleCycleVALU;
+  if (SII->isDS(Inst))
+    return InstructionFlavor::DS;
+  if (SII->isFLAT(Inst) || SII->isFLATGlobal(Inst) || SII->isFLATScratch(Inst))
+    return InstructionFlavor::VMEM;
+  if (SII->isSALU(Inst))
+    return InstructionFlavor::SALU;
+
+  return InstructionFlavor::Other;
+}
+
+/// Choose the SU this hardware unit would like to have scheduled next.
+///
+/// The first entry of PrioritySUs that is not yet top-ready is returned if
+/// one exists. Otherwise, and only when \p LookDeep is set, every SU in
+/// AllSUs is examined and the unscheduled, not-top-ready SU with the smallest
+/// depth is returned (the earliest one in iteration order on ties).
+///
+/// \returns the selected SU, or nullptr when no SU qualifies.
+SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) {
+  for (auto *Candidate : PrioritySUs)
+    if (!Candidate->isTopReady())
+      return Candidate;
+
+  SUnit *Shallowest = nullptr;
+  if (LookDeep) {
+    unsigned BestDepth = std::numeric_limits<unsigned int>::max();
+    for (auto *Candidate : AllSUs) {
+      // Only SUs that are neither scheduled nor already top-ready are
+      // considered.
+      if (Candidate->isScheduled || Candidate->isTopReady())
+        continue;
+
+      const unsigned Depth = Candidate->getDepth();
+      if (Depth >= BestDepth)
+        continue;
+
+      BestDepth = Depth;
+      Shallowest = Candidate;
+    }
+  }
+  return Shallowest;
+}
+
+/// Start tracking \p SU on this hardware unit.
+///
+/// \p SU is added to AllSUs and \p BlockingCycles is added to TotalCycles.
+/// PrioritySUs is then updated so that it holds exactly the tracked SUs of
+/// minimum depth seen so far:
+///   - deeper than the current priority depth: PrioritySUs is left untouched;
+///   - equal depth: \p SU joins PrioritySUs;
+///   - shallower: PrioritySUs is replaced by \p SU alone.
+///
+/// \p SU must not already be tracked by this unit.
+void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
+  // The insertion has side effects, so it cannot be folded into the assert.
+  // [[maybe_unused]] avoids an unused-variable warning when assertions are
+  // compiled out.
+  [[maybe_unused]] bool Inserted = AllSUs.insert(SU);
+  assert(Inserted && "SU is already tracked by this hardware unit");
+
+  TotalCycles += BlockingCycles;
+
+  if (!PrioritySUs.empty()) {
+    unsigned SUDepth = SU->getDepth();
+    unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+    if (SUDepth > CurrDepth)
+      return;
+
+    // SU is lower depth and should be prioritized over everything collected
+    // so far.
+    if (SUDepth < CurrDepth)
+      PrioritySUs.clear();
+  }
+
+  PrioritySUs.insert(SU);
+}
+
+/// Record that \p SU has been scheduled and stop tracking it.
+///
+/// \p SU is removed from AllSUs and PrioritySUs and \p BlockingCycles is
+/// subtracted from TotalCycles. If that leaves PrioritySUs empty while other
+/// SUs are still tracked, PrioritySUs is rebuilt as the set of remaining SUs
+/// with the minimum depth.
+void HardwareUnitInfo::schedule(SUnit *SU, unsigned BlockingCycles) {
+  // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
+  // we just clear the HWUI. However, we still have instructions which map to
+  // this HWUI. Don't bother managing the state for these HWUI.
+  if (TotalCycles == 0)
+    return;
+
+  AllSUs.remove(SU);
+  PrioritySUs.remove(SU);
+
+  TotalCycles -= BlockingCycles;
+
+  // Nothing to rebuild if no SUs remain or some priority SUs are still
+  // pending.
+  if (AllSUs.empty() || !PrioritySUs.empty())
+    return;
+
+  // The loop variable is deliberately not named SU so that it does not shadow
+  // the parameter.
+  for (auto *Pending : AllSUs) {
+    if (!PrioritySUs.empty()) {
+      unsigned PendingDepth = Pending->getDepth();
+      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+      if (PendingDepth > CurrDepth)
+        continue;
+
+      // Pending is lower depth and should be prioritized.
+      if (PendingDepth < CurrDepth)
+        PrioritySUs.clear();
+    }
+    PrioritySUs.insert(Pending);
+  }
+}
+
+/// Find the HardwareUnitInfo entry whose type equals \p Flavor.
+///
+/// \returns a pointer to the first matching entry of HWUInfo, or nullptr when
+/// no entry matches.
+HardwareUnitInfo *
+CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
+  for (auto It = HWUInfo.begin(), End = HWUInfo.end(); It != End; ++It)
+    if (It->getType() == Flavor)
+      return &*It;
+
+  return nullptr;
+}
+
+/// Return the number of cycles \p SU occupies its hardware unit.
+///
+/// The result is the largest ReleaseAtCycle over all write-proc-resource
+/// entries of the SU's scheduling class (0 when the class has none).
+///
+/// \returns std::numeric_limits<unsigned>::max() as an "unknown" sentinel
+/// when no instruction scheduling model is available.
+unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
+  // Spell the sentinel out explicitly instead of relying on the implicit
+  // conversion of -1 to unsigned; the value is the same.
+  if (!SchedModel || !SchedModel->hasInstrSchedModel())
+    return std::numeric_limits<unsigned>::max();
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  unsigned ReleaseAtCycle = 0;
+  for (TargetSchedModel::ProcResIter
+           PI = SchedModel->getWriteProcResBegin(SC),
+           PE = SchedModel->getWriteProcResEnd(SC);
+       PI != PE; ++PI)
+    ReleaseAtCycle =
+        std::max(ReleaseAtCycle, static_cast<unsigned>(PI->ReleaseAtCycle));
+
+  return ReleaseAtCycle;
+}
+
+/// Notify the heuristics that \p SU has been scheduled so the hardware unit
+/// matching the instruction's flavor can update its bookkeeping.
+void CandidateHeuristics::schedNode(SUnit *SU) {
+  HardwareUnitInfo *HWUI =
+      getHWUIFromFlavor(classifyFlavor(SU->getInstr(), SII));
+
+  // getHWUIFromFlavor() returns nullptr when no entry matches the flavor.
+  // Catch that in asserts builds and avoid dereferencing a null pointer
+  // otherwise.
+  assert(HWUI && "no HWUI registered for this instruction flavor");
+  if (!HWUI)
+    return;
+
+  HWUI->schedule(SU, getHWUICyclesForInst(SU));
+}
+
+/// Bind the heuristics to a scheduling region and (re)build the per-flavor
+/// hardware unit table.
+///
+/// Stores \p SchedDAG and \p TargetSchedModel, derives the SI register and
+/// instruction info from \p TRI and the DAG, creates one reset HWUInfo entry
+/// per InstructionFlavor, marks the WMMA, MultiCycleVALU and TRANS entries as
+/// producing a coexec window, and finally collects the HWUI pressure of the
+/// region.
+void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
+                                     const TargetSchedModel *TargetSchedModel,
+                                     const TargetRegisterInfo *TRI) {
+  DAG = SchedDAG;
+  SchedModel = TargetSchedModel;
+
+  SRI = static_cast<const SIRegisterInfo *>(TRI);
+  SII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  // HWUInfo is indexed by the integer value of the flavor.
+  const auto FlavorIdx = [](InstructionFlavor F) {
+    return static_cast<int>(F);
+  };
+
+  HWUInfo.resize(FlavorIdx(InstructionFlavor::NUM_FLAVORS));
+  for (unsigned I = 0; I < HWUInfo.size(); ++I) {
+    auto &Info = HWUInfo[I];
+    Info.setType(I);
+    Info.reset();
+  }
+
+  HWUInfo[FlavorIdx(InstructionFlavor::WMMA)].setProducesCoexecWindow(true);
+  HWUInfo[FlavorIdx(InstructionFlavor::MultiCycleVALU)]
+      .setProducesCoexecWindow(true);
+  HWUInfo[FlavorIdx(InstructionFlavor::TRANS)].setProducesCoexecWindow(true);
+
+  collectHWUIPressure();
+}
+
+/// Register every SU of the current DAG with the hardware unit of its flavor,
+/// together with the cycles it occupies that unit. Does nothing when no
+/// instruction scheduling model is available. Dumps a region summary in debug
+/// output afterwards.
+void CandidateHeuristics::collectHWUIPressure() {
+  const bool HasInstrModel = SchedModel && SchedModel->hasInstrSchedModel();
+  if (!HasInstrModel)
+    return;
+
+  for (auto &Node : DAG->SUnits) {
+    const InstructionFlavor Flavor = classifyFlavor(Node.getInstr(), SII);
+    auto &Unit = HWUInfo[static_cast<int>(Flavor)];
+    Unit.insert(&Node, getHWUICyclesForInst(&Node));
+  }
+
+  LLVM_DEBUG(dumpRegionSummary());
+}
+
+/// Print a summary of the current region to the debug stream: the function
+/// name, the number of the block containing the region's first instruction,
+/// the SU count, and one line per hardware unit that has a non-zero total
+/// cycle count.
+void CandidateHeuristics::dumpRegionSummary() {
+  auto &OS = dbgs();
+
+  MachineBasicBlock *BB = DAG->begin()->getParent();
+  OS << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
+     << " (" << DAG->SUnits.size() << " SUs) ===\n";
+
+  OS << "\nHWUI Resource Pressure:\n";
+  for (auto &HWUI : HWUInfo) {
+    // Units without any recorded cycles are not listed.
+    if (HWUI.getTotalCycles() == 0)
+      continue;
+
+    OS << "  [" << HWUI.getIdx() << "] " << getFlavorName(HWUI.getType())
+       << ": ";
+    OS << HWUI.getTotalCycles() << " cycles, " << HWUI.size() << " instrs\n";
+  }
+  OS << "\n";
+}
+
+void CandidateHeuristics::sortHWUIResources() {
----------------
arsenm wrote:

Define the predicate function separately, then call sort directly with that function.

https://github.com/llvm/llvm-project/pull/184929


More information about the llvm-branch-commits mailing list