[llvm] 8aa82ef - [AMDGPU][SIInsertWaitcnts] Wait on all LDS DMA operations when no aliasing store is found (#170660)

via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 8 02:02:29 PST 2025


Author: Pierre van Houtryve
Date: 2025-12-08T11:02:24+01:00
New Revision: 8aa82eff569388f50a2fec60b61d9771b0912f0e

URL: https://github.com/llvm/llvm-project/commit/8aa82eff569388f50a2fec60b61d9771b0912f0e
DIFF: https://github.com/llvm/llvm-project/commit/8aa82eff569388f50a2fec60b61d9771b0912f0e.diff

LOG: [AMDGPU][SIInsertWaitcnts] Wait on all LDS DMA operations when no aliasing store is found (#170660)

Previously, we would miss inserting a wait if the ds_read had AA info,
but it didn't match
any LDS DMA op, for example if we didn't track the LDS DMA op it aliases
with because it exceeded the tracking limit.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
    llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5c024252fdbe5..3b59045337b10 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1082,13 +1082,17 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
             }
           }
         }
-        if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
+        if (Slot)
           break;
+        // The slot may not be valid because it can be >= NUM_LDS_VGPRS which
+        // means the scoreboard cannot track it. We still want to preserve the
+        // MI in order to check alias information, though.
         LDSDMAStores.push_back(&Inst);
         Slot = LDSDMAStores.size();
         break;
       }
-      setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
+      if (Slot < NUM_LDS_VGPRS)
+        setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
       if (Slot)
         setRegScore(FIRST_LDS_VGPR, T, CurrScore);
     }
@@ -2006,15 +2010,23 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (Ptr && Memop->getAAInfo()) {
           const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
           for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
-            if (MI.mayAlias(AA, *LDSDMAStores[I], true))
+            if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+              if ((I + 1) >= NUM_LDS_VGPRS) {
+                // We didn't have enough slot to track this LDS DMA store, it
+                // has been tracked using the common RegNo (FIRST_LDS_VGPR).
+                ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+                break;
+              }
+
               ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
+            }
           }
         } else {
           ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
         }
-        if (Memop->isStore()) {
+
+        if (Memop->isStore())
           ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
-        }
       }
 
       // Loop over use and def operands.

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
index 37ba1f42413c9..74513ec9106bc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
@@ -223,6 +223,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
 ; GFX9-NEXT:    ds_read_b32 v7, v9 offset:1792
 ; GFX9-NEXT:    ; wave barrier
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    ds_read_b32 v8, v9 offset:2048
 ; GFX9-NEXT:    ; wave barrier
 ; GFX9-NEXT:    ds_read_b32 v9, v9 offset:2304
@@ -288,6 +289,7 @@ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32
 ; GFX10-NEXT:    s_waitcnt vmcnt(2)
 ; GFX10-NEXT:    ds_read_b32 v7, v9 offset:1792
 ; GFX10-NEXT:    ; wave barrier
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ds_read_b32 v8, v9 offset:2048
 ; GFX10-NEXT:    ; wave barrier
 ; GFX10-NEXT:    ds_read_b32 v9, v9 offset:2304


        


More information about the llvm-commits mailing list