[llvm] [AMDGPU] Add HWUI pressure heuristics to coexec strategy (PR #184929)
Austin Kerbow via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 25 08:20:15 PDT 2026
================
@@ -41,6 +42,365 @@ static SUnit *pickOnlyChoice(SchedBoundary &Zone) {
return OnlyChoice;
}
+/// Classify \p MI into the InstructionFlavor bucket used by the coexec
+/// heuristics.
+///
+/// The tests run in a fixed order and the first one that matches decides the
+/// result, so an instruction satisfying several SIInstrInfo predicates is
+/// reported under the earliest one checked here. Debug instructions, and
+/// instructions matching none of the tests, are InstructionFlavor::Other.
+InstructionFlavor llvm::AMDGPU::classifyFlavor(const MachineInstr &MI,
+                                               const SIInstrInfo &SII) {
+  if (MI.isDebugInstr())
+    return InstructionFlavor::Other;
+
+  // A fixed set of opcodes is reported as Fence before any SIInstrInfo
+  // predicate is consulted.
+  switch (MI.getOpcode()) {
+  case AMDGPU::ATOMIC_FENCE:
+  case AMDGPU::S_WAIT_ASYNCCNT:
+  case AMDGPU::S_WAIT_TENSORCNT:
+  case AMDGPU::S_BARRIER_WAIT:
+  case AMDGPU::S_BARRIER_SIGNAL_IMM:
+    return InstructionFlavor::Fence;
+  default:
+    break;
+  }
+
+  // Predicate-based classification. The order below is significant: it is the
+  // tie-break whenever more than one predicate holds for the same instruction
+  // (e.g. TRANS is tested before the generic VALU check).
+  if (SII.isLDSDMA(MI))
+    return InstructionFlavor::DMA;
+  if (SII.isMFMAorWMMA(MI))
+    return InstructionFlavor::WMMA;
+  if (SII.isTRANS(MI))
+    return InstructionFlavor::TRANS;
+  if (SII.isVALU(MI))
+    return InstructionFlavor::SingleCycleVALU;
+  if (SII.isDS(MI))
+    return InstructionFlavor::DS;
+
+  // Any of the three FLAT predicates maps the instruction to VMEM.
+  const bool IsFlatFamily =
+      SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI);
+  if (IsFlatFamily)
+    return InstructionFlavor::VMEM;
+
+  return SII.isSALU(MI) ? InstructionFlavor::SALU : InstructionFlavor::Other;
+}
+
+/// Pick the SU of this hardware unit that the scheduler should try to unblock
+/// next.
+///
+/// The first entry of PrioritySUs for which isTopReady() is false is returned
+/// if there is one. Otherwise, when \p LookDeep is set, AllSUs is scanned for
+/// the unscheduled, not-top-ready SU with the smallest depth (the earliest
+/// such SU in iteration order wins ties). Returns nullptr when nothing
+/// qualifies.
+SUnit *HardwareUnitInfo::getNextTargetSU(bool LookDeep) const {
+  for (auto *Candidate : PrioritySUs)
+    if (!Candidate->isTopReady())
+      return Candidate;
+
+  if (!LookDeep)
+    return nullptr;
+
+  SUnit *Shallowest = nullptr;
+  unsigned ShallowestDepth = std::numeric_limits<unsigned int>::max();
+  for (auto *SU : AllSUs) {
+    // Only SUs that still wait on something are interesting targets.
+    if (SU->isScheduled || SU->isTopReady())
+      continue;
+
+    const unsigned Depth = SU->getDepth();
+    if (Depth >= ShallowestDepth)
+      continue;
+
+    ShallowestDepth = Depth;
+    Shallowest = SU;
+  }
+  return Shallowest;
+}
+
+/// Register \p SU with this hardware unit and add \p BlockingCycles to the
+/// unit's running cycle total.
+///
+/// PrioritySUs is maintained so that it only holds SUs sharing the smallest
+/// depth inserted so far: a deeper SU is ignored, an SU of equal depth is
+/// appended, and a shallower SU replaces the current contents. \p SU must not
+/// already be present in AllSUs (asserted).
+void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
+  [[maybe_unused]] bool Inserted = AllSUs.insert(SU);
+  assert(Inserted);
+  TotalCycles += BlockingCycles;
+
+  if (!PrioritySUs.empty()) {
+    const unsigned IncomingDepth = SU->getDepth();
+    const unsigned BestDepth = (*PrioritySUs.begin())->getDepth();
+
+    // Deeper than the current priority set: not a priority SU.
+    if (IncomingDepth > BestDepth)
+      return;
+
+    // Strictly shallower: it supersedes everything collected so far.
+    if (IncomingDepth < BestDepth)
+      PrioritySUs.clear();
+  }
+
+  PrioritySUs.insert(SU);
+}
+
+/// Remove \p SU from this hardware unit's bookkeeping after it has been
+/// scheduled and subtract \p BlockingCycles from the cycle total.
+///
+/// If removing \p SU leaves PrioritySUs empty while other SUs remain, the
+/// priority set is rebuilt from AllSUs as the SUs that share the smallest
+/// depth.
+void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
+  // Some HWUIs (e.g. InstructionFlavor::Other) may be deliberately ignored by
+  // clearing them, yet instructions still map onto them. Such a unit has no
+  // cycles recorded, so skip all state management for it.
+  if (TotalCycles == 0)
+    return;
+
+  AllSUs.remove(SU);
+  PrioritySUs.remove(SU);
+  TotalCycles -= BlockingCycles;
+
+  // Nothing to rebuild if the unit is drained or still has priority SUs.
+  if (AllSUs.empty() || !PrioritySUs.empty())
+    return;
+
+  for (auto *Remaining : AllSUs) {
+    if (!PrioritySUs.empty()) {
+      const unsigned Depth = Remaining->getDepth();
+      const unsigned BestDepth = (*PrioritySUs.begin())->getDepth();
+
+      // Deeper than the best seen so far: not a priority SU.
+      if (Depth > BestDepth)
+        continue;
+
+      // Strictly shallower: restart the priority set from this SU.
+      if (Depth < BestDepth)
+        PrioritySUs.clear();
+    }
+    PrioritySUs.insert(Remaining);
+  }
+}
+
+/// Find the hardware unit whose type equals \p Flavor.
+///
+/// HWUInfo is searched linearly by type rather than indexed by the flavor
+/// value. Returns nullptr when no entry has the requested type.
+HardwareUnitInfo *
+CandidateHeuristics::getHWUIFromFlavor(InstructionFlavor Flavor) {
+  HardwareUnitInfo *Match = nullptr;
+  for (HardwareUnitInfo &Info : HWUInfo) {
+    if (Info.getType() != Flavor)
+      continue;
+    Match = &Info;
+    break;
+  }
+  return Match;
+}
+
+/// Return the largest ReleaseAtCycle among the write-proc-resource entries of
+/// \p SU's scheduling class, or 0 if the class has no such entries.
+///
+/// Requires a scheduling model with an instruction sched model (asserted).
+unsigned CandidateHeuristics::getHWUICyclesForInst(SUnit *SU) {
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  TargetSchedModel::ProcResIter It = SchedModel->getWriteProcResBegin(SC);
+  const TargetSchedModel::ProcResIter End = SchedModel->getWriteProcResEnd(SC);
+
+  unsigned MaxRelease = 0;
+  while (It != End) {
+    const unsigned Release = static_cast<unsigned>(It->ReleaseAtCycle);
+    if (Release > MaxRelease)
+      MaxRelease = Release;
+    ++It;
+  }
+  return MaxRelease;
+}
+
+/// Tell the hardware unit that owns \p SU's instruction flavor that \p SU has
+/// been scheduled, passing along the SU's blocking cycle count.
+void CandidateHeuristics::updateForScheduling(SUnit *SU) {
+  const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
+
+  HardwareUnitInfo *HWUI = getHWUIFromFlavor(Flavor);
+  assert(HWUI);
+
+  const unsigned BlockingCycles = getHWUICyclesForInst(SU);
+  HWUI->markScheduled(SU, BlockingCycles);
+}
+
+/// Bind the heuristics to a scheduling region and rebuild the per-flavor
+/// hardware unit table.
+///
+/// \param SchedDAG         DAG of the region; also the source of the target
+///                         instruction info.
+/// \param TargetSchedModel Scheduling model; must provide an instruction
+///                         sched model (asserted).
+/// \param TRI              Register info, stored as SIRegisterInfo.
+///
+/// HWUInfo gets one freshly reset entry per InstructionFlavor, with entry I
+/// typed as flavor I. WMMA, MultiCycleVALU and TRANS are flagged as coexec
+/// window producers, and the region's pressure is collected at the end.
+void CandidateHeuristics::initialize(ScheduleDAGMI *SchedDAG,
+                                     const TargetSchedModel *TargetSchedModel,
+                                     const TargetRegisterInfo *TRI) {
+  DAG = SchedDAG;
+  SchedModel = TargetSchedModel;
+  assert(SchedModel && SchedModel->hasInstrSchedModel());
+
+  SRI = static_cast<const SIRegisterInfo *>(TRI);
+  SII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+  HWUInfo.resize(static_cast<int>(InstructionFlavor::NUM_FLAVORS));
+  for (unsigned I = 0, E = HWUInfo.size(); I != E; ++I) {
+    HWUInfo[I].setType(I);
+    HWUInfo[I].reset();
+  }
+
+  // Flavors whose hardware units produce a coexec window.
+  const InstructionFlavor CoexecProducers[] = {
+      InstructionFlavor::WMMA,
+      InstructionFlavor::MultiCycleVALU,
+      InstructionFlavor::TRANS,
+  };
+  for (const InstructionFlavor Producer : CoexecProducers)
+    HWUInfo[static_cast<int>(Producer)].setProducesCoexecWindow(true);
+
+  collectHWUIPressure();
+}
+
+/// Register every SU of the region with the hardware unit of its instruction
+/// flavor, together with the SU's blocking cycle count.
+///
+/// Does nothing when no instruction sched model is available. HWUInfo is
+/// indexed directly by the flavor value here.
+void CandidateHeuristics::collectHWUIPressure() {
+  const bool HasModel = SchedModel && SchedModel->hasInstrSchedModel();
+  if (HasModel) {
+    for (SUnit &SU : DAG->SUnits) {
+      const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
+      const unsigned BlockingCycles = getHWUICyclesForInst(&SU);
+      HWUInfo[static_cast<int>(Flavor)].insert(&SU, BlockingCycles);
+    }
+
+    LLVM_DEBUG(dumpRegionSummary());
+  }
+}
+
+/// Print a debug summary of the current region to dbgs(): the function name,
+/// block number and SU count, followed by one line per hardware unit that has
+/// a non-zero cycle total.
+void CandidateHeuristics::dumpRegionSummary() {
+  raw_ostream &OS = dbgs();
+
+  MachineBasicBlock *BB = DAG->begin()->getParent();
+  OS << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
+     << " (" << DAG->SUnits.size() << " SUs) ===\n";
+
+  OS << "\nHWUI Resource Pressure:\n";
+  for (auto &HWUI : HWUInfo) {
+    // Units without any recorded cycles are omitted from the listing.
+    if (HWUI.getTotalCycles() != 0) {
+      StringRef Name = getFlavorName(HWUI.getType());
+      OS << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
+         << HWUI.size() << " instrs\n";
+    }
+  }
+  OS << "\n";
+}
+
+/// Reorder HWUInfo so that the highest-priority hardware unit comes first.
+///
+/// Sort keys, from most to least significant:
+///   1. units that produce a coexec window before those that do not;
+///   2. larger total cycle count first;
+///   3. on equal cycle counts, fewer instructions first;
+///   4. smaller flavor value first.
+///
+/// After this call HWUInfo[I] is no longer guaranteed to be the unit whose
+/// type is I; look units up by type instead of indexing by flavor.
+void CandidateHeuristics::sortHWUIResources() {
+  llvm::sort(HWUInfo, [](HardwareUnitInfo &A, HardwareUnitInfo &B) {
+    // Prefer CoexecWindow producers.
+    if (A.producesCoexecWindow() != B.producesCoexecWindow())
+      return A.producesCoexecWindow();
+
+    // Prefer more demanded resources.
+    if (A.getTotalCycles() != B.getTotalCycles())
+      return A.getTotalCycles() > B.getTotalCycles();
+
+    // In ties, prefer the resource with FEWER instructions: the total cycles
+    // are equal at this point, so fewer instructions means more blocking
+    // cycles per instruction on average.
+    // NOTE(review): this comment previously read "more instructions", which
+    // contradicted the comparison below. The comparison is kept as written;
+    // confirm that this is the intended direction.
+    if (A.size() != B.size())
+      return A.size() < B.size();
+
+    // Default to Flavor order.
+    return static_cast<unsigned>(A.getType()) <
+           static_cast<unsigned>(B.getType());
+  });
+}
+
+/// Decide between \p TryCand and \p Cand by which of them is related (per
+/// DAG->IsReachable) to the next target SU of a hardware unit.
+///
+/// Hardware units are visited in HWUInfo order. A unit is skipped when it has
+/// no target SU, or when neither candidate is related to its target. For the
+/// first unit where at least one candidate is related:
+///   - if only one candidate is, that candidate is preferred;
+///   - if both are, the one with the greater height is preferred, and \p Cand
+///     wins ties.
+/// The preference is recorded as GenericSchedulerBase::RegCritical in the
+/// winner's Reason.
+///
+/// \returns true if a preference was recorded, false if no hardware unit
+///          separated the two candidates.
+bool CandidateHeuristics::tryCriticalResourceDependency(
+    GenericSchedulerBase::SchedCandidate &TryCand,
+    GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+  // The candidates' flavors do not depend on the unit being examined, so
+  // classify them once up front.
+  const InstructionFlavor CandFlavor =
+      classifyFlavor(*Cand.SU->getInstr(), *SII);
+  const InstructionFlavor TryCandFlavor =
+      classifyFlavor(*TryCand.SU->getInstr(), *SII);
+  const bool CandIsDS = CandFlavor == InstructionFlavor::DS;
+  const bool EitherIsDS = CandIsDS || TryCandFlavor == InstructionFlavor::DS;
+
+  // Cand's Reason is only overwritten when it currently compares greater than
+  // RegCritical; TryCand's Reason is always overwritten.
+  auto PreferCand = [&Cand]() {
+    if (Cand.Reason > GenericSchedulerBase::RegCritical)
+      Cand.Reason = GenericSchedulerBase::RegCritical;
+  };
+  auto PreferTryCand = [&TryCand]() {
+    TryCand.Reason = GenericSchedulerBase::RegCritical;
+  };
+
+  // Bind each unit by reference: HardwareUnitInfo owns SU containers and was
+  // previously copied once per visited unit.
+  for (const HardwareUnitInfo &HWUI : HWUInfo) {
+    const bool IsWMMAUnit = HWUI.getType() == InstructionFlavor::WMMA;
+
+    // If we do not have a target SU for this resource, then it is not
+    // critical; move on to the next resource.
+    if (!HWUI.getNextTargetSU(/*LookDeep=*/IsWMMAUnit && EitherIsDS))
+      continue;
+
+    // We want to ensure our DS order matches WMMA order.
+    SUnit *TargetSU = HWUI.getNextTargetSU(/*LookDeep=*/IsWMMAUnit && CandIsDS);
+
+    // The lookup above is shallower than the criticality check whenever only
+    // TryCand is a DS instruction, so it can come back empty even though the
+    // unit was deemed critical. Never hand a null SU to IsReachable.
+    if (!TargetSU)
+      continue;
+
+    const bool CandEnables =
+        TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
+    const bool TryCandEnables =
+        TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
+
+    // Neither candidate enables this resource; try the next one.
+    if (!CandEnables && !TryCandEnables)
+      continue;
+
+    // Exactly one candidate enables the resource: it wins.
+    if (CandEnables != TryCandEnables) {
+      if (CandEnables)
+        PreferCand();
+      else
+        PreferTryCand();
+      return true;
+    }
+
+    // Both enable, prefer the critical path. On equal heights the original
+    // candidate is kept.
+    if (TryCand.SU->getHeight() > Cand.SU->getHeight())
+      PreferTryCand();
+    else
+      PreferCand();
+    return true;
+  }
+
+  return false;
+}
+
+bool CandidateHeuristics::tryCriticalResource(
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const {
+ for (unsigned I = 0; I < HWUInfo.size(); I++) {
+ HardwareUnitInfo HWUI = HWUInfo[I];
----------------
kerbowa wrote:
```suggestion
HardwareUnitInfo &HWUI = HWUInfo[I];
```
https://github.com/llvm/llvm-project/pull/184929
More information about the llvm-commits
mailing list