[llvm] [AMDGPU][SIInsertWaitcnts][NFC] Move instruction collection into a separate function (PR #179358)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 08:59:28 PST 2026
https://github.com/vporpo updated https://github.com/llvm/llvm-project/pull/179358
>From 31d18eaa8171dd32609590f98e0a11dce47f8804 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Tue, 3 Mar 2026 00:29:38 +0000
Subject: [PATCH] [AMDGPU][SIInsertWaitcnts][NFC] Move instruction collection
into a separate function
This patch moves some of the state-modifying code out of generateWaitcntInstBefore() into separate functions.
A follow-up patch will move the remaining state-modifying code, working toward a read-only `getWait()` function.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 67 ++++++++++++++++-----
1 file changed, 51 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 78430438ea9f6..31664150d529c 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -645,6 +645,9 @@ class SIInsertWaitcnts {
}
bool isVmemAccess(const MachineInstr &MI) const;
+ void updateStateBeforeWaitcnt(MachineInstr &MI,
+ const WaitcntBrackets &ScoreBrackets);
+ void updateStateAfterWaitcnt(MachineInstr &MI);
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
@@ -2377,6 +2380,51 @@ bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
return Modified;
}
+void SIInsertWaitcnts::updateStateBeforeWaitcnt(
+ MachineInstr &MI, const WaitcntBrackets &ScoreBrackets) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ case AMDGPU::SI_RETURN:
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
+ case AMDGPU::S_SETPC_B64_return: {
+ // All waits must be resolved at call return.
+ // NOTE: this could be improved with knowledge of all call sites or
+ // with knowledge of the called routines.
+ ReturnInsts.insert(&MI);
+ break;
+ case AMDGPU::S_ENDPGM:
+ case AMDGPU::S_ENDPGM_SAVED:
+ // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+ // Technically the hardware will do this on its own if we don't, but that
+ // might cost extra cycles compared to doing it explicitly.
+ // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+ // have to wait for outstanding VMEM stores. In this case it can be useful
+ // to send a message to explicitly release all VGPRs before the stores have
+ // completed, but it is only safe to do this if there are no outstanding
+ // scratch stores.
+ EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
+ break;
+ default:
+ if (MI.isCall())
+ CallInsts.insert(&MI);
+ break;
+ }
+ }
+}
+
+void SIInsertWaitcnts::updateStateAfterWaitcnt(MachineInstr &MI) {
+ for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (Memop->isStore()) {
+ if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
+ if (PDT->dominates(MI.getParent(), It->second))
+ SLoadAddresses.erase(It);
+ }
+ }
+ }
+}
+
/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
@@ -2421,7 +2469,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// All waits must be resolved at call return.
// NOTE: this could be improved with knowledge of all call sites or
// with knowledge of the called routines.
- ReturnInsts.insert(&MI);
AMDGPU::Waitcnt AllZeroWait =
WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
// On GFX12+, if LOAD_CNT is pending but no VGPRs are waiting for loads
@@ -2436,16 +2483,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
case AMDGPU::S_ENDPGM:
case AMDGPU::S_ENDPGM_SAVED: {
- // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
- // Technically the hardware will do this on its own if we don't, but that
- // might cost extra cycles compared to doing it explicitly.
- // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
- // have to wait for outstanding VMEM stores. In this case it can be useful
- // to send a message to explicitly release all VGPRs before the stores have
- // completed, but it is only safe to do this if there are no outstanding
- // scratch stores.
- EndPgmInsts[&MI] = !ScoreBrackets.empty(STORE_CNT) &&
- !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS);
break;
}
case AMDGPU::S_SENDMSG:
@@ -2485,7 +2522,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// The function is going to insert a wait on everything in its prolog.
// This still needs to be careful if the call target is a load (e.g. a GOT
// load). We also need to check WAW dependency with saved PC.
- CallInsts.insert(&MI);
Wait = AMDGPU::Waitcnt();
const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
@@ -2519,11 +2555,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
for (const MachineMemOperand *Memop : MI.memoperands()) {
const Value *Ptr = Memop->getValue();
if (Memop->isStore()) {
- if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
+ if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end())
addWait(Wait, SmemAccessCounter, 0);
- if (PDT->dominates(MI.getParent(), It->second))
- SLoadAddresses.erase(It);
- }
}
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
@@ -3261,10 +3294,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
+ updateStateBeforeWaitcnt(Inst, ScoreBrackets);
// Generate an s_waitcnt instruction to be placed before Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
FlushFlags);
+ updateStateAfterWaitcnt(Inst);
OldWaitcntInstr = nullptr;
if (TII->isSMRD(Inst)) {
More information about the llvm-commits
mailing list