[llvm] [AMDGPU] Add HWUI pressure heuristics to coexec strategy (PR #184929)

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 25 08:20:15 PDT 2026


================
@@ -41,6 +42,365 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
   return OnlyChoice;
 }
 
+InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
+                                               const SIInstrInfo &SII) {
+  if (MI.isDebugInstr())
+    return InstructionFlavor::Other;
+
+  unsigned Opc = MI.getOpcode();
+
+  // Check for specific opcodes first.
+  if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
+      Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
+      Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
+    return InstructionFlavor::Fence;
+
+  if (SII.isLDSDMA(MI))
+    return InstructionFlavor::DMA;
+
+  if (SII.isMFMAorWMMA(MI))
+    return InstructionFlavor::WMMA;
+
+  if (SII.isTRANS(MI))
+    return InstructionFlavor::TRANS;
+
+  if (SII.isVALU(MI))
+    return InstructionFlavor::SingleCycleVALU;
+
+  if (SII.isDS(MI))
+    return InstructionFlavor::DS;
+
+  if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
+    return InstructionFlavor::VMEM;
+
+  if (SII.isSALU(MI))
+    return InstructionFlavor::SALU;
+
+  return InstructionFlavor::Other;
+}
+
+SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
+  for (auto *PrioritySU : PrioritySUs) {
+    if (!PrioritySU->isTopReady())
+      return PrioritySU;
+  }
+
+  if (!LookDeep)
+    return nullptr;
+
+  unsigned MinDepth = std::numeric_limits<unsigned int>::max();
+  SUnit *TargetSU = nullptr;
+  for (auto *SU : AllSUs) {
+    if (SU->isScheduled)
+      continue;
+
+    if (SU->isTopReady())
+      continue;
+
+    if (SU->getDepth() < MinDepth) {
+      MinDepth = SU->getDepth();
+      TargetSU = SU;
+    }
+  }
+  return TargetSU;
+}
+
+void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
+  [[maybe_unused]] bool Inserted = AllSUs.insert(SU);
+  TotalCycles += BlockingCycles;
+
+  assert(Inserted);
+  if (PrioritySUs.empty()) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+  unsigned SUDepth = SU->getDepth();
+  unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+  if (SUDepth > CurrDepth)
+    return;
+
+  if (SUDepth == CurrDepth) {
+    PrioritySUs.insert(SU);
+    return;
+  }
+
+  // SU is lower depth and should be prioritized.
+  PrioritySUs.clear();
+  PrioritySUs.insert(SU);
+}
+
+void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
+  // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
+  // we just clear the HWUI. However, we still have instructions which map to
+  // this HWUI. Don't bother managing the state for these HWUI.
+  if (TotalCycles == 0)
+    return;
+
+  AllSUs.remove(SU);
+  PrioritySUs.remove(SU);
+
+  TotalCycles -= BlockingCycles;
+
+  if (AllSUs.empty())
+    return;
+  if (PrioritySUs.empty()) {
+    for (auto SU : AllSUs) {
+      if (PrioritySUs.empty()) {
+        PrioritySUs.insert(SU);
+        continue;
+      }
+      unsigned SUDepth = SU->getDepth();
+      unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
+      if (SUDepth > CurrDepth)
+        continue;
+
+      if (SUDepth == CurrDepth) {
+        PrioritySUs.insert(SU);
+        continue;
+      }
+
+      // SU is lower depth and should be prioritized.
+      PrioritySUs.clear();
+      PrioritySUs.insert(SU);
+    }
+  }
+}
+
+HardwareUnitInfo *
+CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
+  for (auto &HWUICand : HWUInfo) {
+    if (HWUICand.getType() == Flavor) {
+      return &HWUICand;
+    }
+  }
+  return nullptr;
+}
+
+unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+  unsigned ReleaseAtCycle = 0;
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
+                                     PE = SchedModel->getWriteProcResEnd(SC);
+       PI != PE; ++PI) {
+    ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
+  }
+  return ReleaseAtCycle;
+}
+
+void CandidateHeuristics::updateForScheduling(SUnit *SU) {
+  HardwareUnitInfo *HWUI =
+      getHWUIFromFlavor(classifyFlavor(*SU->getInstr(), *SII));
+  assert(HWUI);
+  HWUI->markScheduled(SU, getHWUICyclesForInst(SU));
+}
+
+void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
+                                     const TargetSchedModel *TargetSchedModel,
+                                     const TargetRegisterInfo *TRI) {
+  DAG = SchedDAG;
+  SchedModel = TargetSchedModel;
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+
+  SRI = static_cast<const SIRegisterInfo *>(TRI);
+  SII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  HWUInfo.resize((int)InstructionFlavor::NUM_FLAVORS);
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    HWUInfo[I].setType(I);
+    HWUInfo[I].reset();
+  }
+
+  HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
+  HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
+
+  collectHWUIPressure();
+}
+
+void CandidateHeuristics::collectHWUIPressure() {
+  if (!SchedModel || !SchedModel->hasInstrSchedModel())
+    return;
+
+  for (auto &SU : DAG->SUnits) {
+    const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
+    HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
+  }
+
+  LLVM_DEBUG(dumpRegionSummary());
+}
+
+void CandidateHeuristics::dumpRegionSummary() {
+  MachineBasicBlock *BB = DAG->begin()->getParent();
+  dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
+         << " (" << DAG->SUnits.size() << " SUs) ===\n";
+
+  dbgs() << "\nHWUI Resource Pressure:\n";
+  for (auto &HWUI : HWUInfo) {
+    if (HWUI.getTotalCycles() == 0)
+      continue;
+
+    StringRef Name = getFlavorName(HWUI.getType());
+    dbgs() << "  " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
+           << HWUI.size() << " instrs\n";
+  }
+  dbgs() << "\n";
+}
+
+void CandidateHeuristics::sortHWUIResources() {
+  // Highest priority should be first.
+  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
+    // Prefer CoexecWindow producers
+    if (A.producesCoexecWindow() != B.producesCoexecWindow())
+      return A.producesCoexecWindow();
+
+    // Prefer more demanded resources
+    if (A.getTotalCycles() != B.getTotalCycles())
+      return A.getTotalCycles() > B.getTotalCycles();
+
+    // In ties -- prefer the resource with more instructions
+    if (A.size() != B.size())
+      return A.size() < B.size();
+
+    // Default to Flavor order
+    return (unsigned)A.getType() < (unsigned)B.getType();
+  });
+}
+
+bool CandidateHeuristics::tryCriticalResourceDependency(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+
+  auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
+    const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
+
+    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
+    auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
+    bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
+                     TryCandFlavor == InstructionFlavor::DS) &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+
+    // If we do not have a TargetSU for this resource, then it is not critical.
+    if (!TargetSU)
+      return false;
+
+    return true;
+  };
+
+  auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
+    HardwareUnitInfo HWUI = HWUInfo[ResourceIdx];
+    auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
+
+    // We want to ensure our DS order matches WMMA order.
+    bool LookDeep = CandFlavor == InstructionFlavor::DS &&
+                    HWUI.getType() == InstructionFlavor::WMMA;
+    auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
+
+    bool CandEnables =
+        TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
+    bool TryCandEnables =
+        TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
+
+    if (!CandEnables && !TryCandEnables)
+      return false;
+
+    if (CandEnables && !TryCandEnables) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (!CandEnables && TryCandEnables) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Both enable, prefer the critical path.
+    unsigned CandHeight = Cand.SU->getHeight();
+    unsigned TryCandHeight = TryCand.SU->getHeight();
+
+    if (CandHeight > TryCandHeight) {
+      if (Cand.Reason > GenericSchedulerBase::RegCritical)
+        Cand.Reason = GenericSchedulerBase::RegCritical;
+
+      return true;
+    }
+
+    if (CandHeight < TryCandHeight) {
+      TryCand.Reason = GenericSchedulerBase::RegCritical;
+      return true;
+    }
+
+    // Same critical path, just prefer original candidate.
+    if (Cand.Reason > GenericSchedulerBase::RegCritical)
+      Cand.Reason = GenericSchedulerBase::RegCritical;
+
+    return true;
+  };
+
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    // If we have encountered a resource that is not critical, then neither
+    // candidate enables a critical resource
+    if (!HasPrioritySU(I))
+      continue;
+
+    bool Enabled = TryEnablesResource(I);
+    // If neither has enabled the resource, continue to the next resource
+    if (Enabled)
+      return true;
+  }
+  return false;
+}
+
+bool CandidateHeuristics::tryCriticalResource(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+  for (unsigned I = 0; I < HWUInfo.size(); I++) {
+    HardwareUnitInfo HWUI = HWUInfo[I];
----------------
kerbowa wrote:

```suggestion
    HardwareUnitInfo &HWUI = HWUInfo[I];
```

https://github.com/llvm/llvm-project/pull/184929


More information about the llvm-commits mailing list