[llvm] [AMDGPU] Add DS loop wait optimization infrastructure (1/4) (PR #171942)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 12 02:37:04 PST 2025
================
@@ -2643,6 +2670,85 @@ bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
return SIInstrInfo::isVMEM(MI);
}
+//===----------------------------------------------------------------------===//
+// DS Loop Wait Optimization (GFX12+)
+//
+// This optimization relaxes DS wait counts in single-block loops that have
+// many DS loads and WMMA/MFMA instructions (typical GEMM kernels with software
+// pipelining). Instead of waiting for almost all DS loads to complete before
+// each WMMA, we analyze which specific loads feed each WMMA and wait only for
+// those to complete, allowing more overlap between memory and compute.
+//
+// Opportunity arises when the load ordering in the preheader block and
+// the load ordering at the end of the loop body, feeding the loaded data
+// to the next iteration, are not matched well (since their orderings are
+// not co-optimized)
+//===----------------------------------------------------------------------===//
+
+bool SIInsertWaitcnts::isEligibleForDSLoopOpt(MachineLoop *ML,
+ LoopDSWaitOptInfo &Info) const {
+ if (!OptimizeDSLoopWaitcnt)
+ return false;
+
+ // Only for GFX12+ where we have a separate counter for LDS.
+ if (!ST->hasExtendedWaitCounts())
+ return false;
+
+ // Must be a single-block loop. Makes the analysis easier.
+ if (ML->getNumBlocks() != 1)
+ return false;
+
+ MachineBasicBlock *MBB = ML->getHeader();
+
+ // Count DS loads, WMMA/MFMA instructions, and total non-meta instructions
+ unsigned NumDSLoads = 0;
+ unsigned NumWMMA = 0;
+ unsigned NumInsts = 0;
+
+ for (const MachineInstr &MI : *MBB) {
+ if (!MI.isMetaInstruction())
+ ++NumInsts;
+
+ if (SIInstrInfo::isDS(MI)) {
+ if (MI.mayLoad() && !MI.mayStore())
+ ++NumDSLoads;
+ } else if (SIInstrInfo::isWMMA(MI) || SIInstrInfo::isMFMA(MI)) {
+ ++NumWMMA;
+ }
+ }
+
+ // Heuristics: need significant number of DS loads and WMMA/MFMA
+ // to make this optimization worthwhile
+ if (NumDSLoads < 16 || NumWMMA < 8)
+ return false;
+
+ // DS loads and WMMAs should be a significant portion of the loop body
+ // (at least 1/4 of the instructions)
+ if ((NumDSLoads + NumWMMA) * 4 < NumInsts)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: Loop at "; MBB->printName(dbgs());
+ dbgs() << " - " << NumDSLoads << " DS loads, " << NumWMMA
+ << " WMMA/MFMA, " << NumInsts
+ << " total insts, eligible\n");
+
+ return true;
+}
+
+void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
+ MachineBasicBlock *MBB = ML->getHeader();
+ LoopDSWaitOptInfo &Info = LoopDSWaitOptCache[MBB];
+
+ // Quick structural checks
+ if (!isEligibleForDSLoopOpt(ML, Info)) {
+ Info.Valid = false;
+ return;
+ }
+
+ // For now, just mark as invalid - full analysis comes in a later PR.
+ Info.Valid = false;
----------------
arsenm wrote:
Both of these paths just set Valid to false, which is already its initial value. Invert this so that Valid is set from the result of isEligibleForDSLoopOpt?
https://github.com/llvm/llvm-project/pull/171942
More information about the llvm-commits mailing list