[llvm-branch-commits] [llvm] [AMDGPU] DS loop wait relaxation -- more test cases and improvements … (PR #171952)

Fri Dec 12 03:03:11 PST 2025

================
@@ -2789,6 +2806,42 @@ void SIInsertWaitcnts::analyzeSingleBBLoopDSLoads(MachineLoop *ML) {
     if (!AfterLastBarrier)
       continue;
 
+    // Check for instructions that write to LDS through DMA (global_load_lds,
+    // etc.). These write to LDS but aren't DS instructions, so bail out if
+    // any appear after the last barrier.
+    if (SIInstrInfo::mayWriteLDSThroughDMA(MI)) {
+      LLVM_DEBUG(
+          dbgs() << "Loop DS Wait Opt: LDS DMA write after last barrier, "
+                 << "skipping\n");
+      Info.Valid = false;
+      return;
+    }
+
+    // Check for tensor_load_to_lds instructions (these are MIMG, so they are
+    // not caught by the check above).
+    if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
+        MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2) {
+      LLVM_DEBUG(dbgs() << "Loop DS Wait Opt: tensor_load_to_lds after last "
+                        << "barrier, skipping\n");
+      Info.Valid = false;
+      return;
+    }
+
+    // Check if this instruction uses or overwrites any tracked DS load
+    // destination. If so, baseline will have inserted a wait that flushes
+    // all loads up to that position (since DS loads complete in order).
+    // Overwrites also require the load to complete first to avoid races.
+    for (auto &[Reg, Position] : TrackedLoads) {
----------------
arsenm wrote:

```suggestion
    for (auto [Reg, Position] : TrackedLoads) {
```

https://github.com/llvm/llvm-project/pull/171952