[llvm] [AMDGPU] Add DS loop waitcnt optimization for GFX12+ (PR #172728)

Sameer Sahasrabuddhe via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 23 20:10:26 PST 2025


================
@@ -2768,18 +2839,54 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
         for (const MachineOperand &Op : MI.all_defs()) {
           for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
             // If we find a register that is loaded inside the loop, 1. and 2.
-            // are invalidated and we can exit.
+            // are invalidated.
             if (VgprUse.contains(RU))
-              return false;
-            VgprDef.insert(RU);
+              VMemInvalidated = true;
+            VgprDefVMEM.insert(RU);
+          }
+        }
+        // Early exit if both optimizations are invalidated
+        if (VMemInvalidated && DSInvalidated)
+          return Flags;
+      }
+
+      // DS read vgpr def
+      // Note: Unlike VMEM, we DON'T invalidate when VgprUse.contains(RegNo).
+      // If USE comes before DEF, it's the prefetch pattern (use value from
+      // previous iteration, load for next iteration). We should still flush
+      // in preheader so iteration 1 doesn't need to wait inside the loop.
+      // Only invalidate when DEF comes before USE (same-iteration consumption,
+      // checked above when processing uses).
+      if (isDSRead(MI)) {
+        for (const MachineOperand &Op : MI.all_defs()) {
+          for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+            VgprDefDS.insert(RU);
           }
         }
       }
     }
+    // Accumulate unprotected DS stores from this MBB
+    SeenDSStoreInLoop |= SeenDSStoreInCurrMBB;
   }
-  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
-    return true;
-  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
+
+  // VMEM flush decision
+  if (!VMemInvalidated && UsesVgprLoadedOutsideVMEM &&
+      ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
+       (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
+    Flags.FlushVmCnt = true;
+
+  // DS flush decision: flush if loop uses DS-loaded values from outside
+  // and either has no DS reads in the loop, or DS reads whose results
+  // are not used in the loop.
+  // DSInvalidated is pre-set to true on non-GFX12+ targets where DS_CNT
+  // is LGKM_CNT which also tracks FLAT/SMEM.
+  // DS stores share DS_CNT with DS reads, but stores before a barrier are OK
----------------
ssahasra wrote:

This is not true if the "DS store" is actually a DMA operation.

https://github.com/llvm/llvm-project/pull/172728


More information about the llvm-commits mailing list