[llvm-branch-commits] [llvm] [AMDGPU][HWEvents] Refactor VMEM_ACCESS as VMEM_READ_ACCESS (PR #204545)

Fri Jun 26 01:07:21 PDT 2026

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/204545

>From 134210da2a011192e5d01424dc5ea959cc5bb5f5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 18 Jun 2026 12:04:20 +0200
Subject: [PATCH 1/2] [AMDGPU][HWEvents] Refactor VMEM_ACCESS as
 VMEM_READ_ACCESS

Instead of having an HWEvent that can be either a read or a write
depending on the target, keep the events as straightforward as
possible and let InsertWaitCnt interpret it. Rename VMEM_ACCESS
to VMEM_READ_ACCESS and set VMEM_STORE_ACCESS & similar events
even if the target does not have a VSCnt.

I think this conceptually makes more sense.
This separates concerns better so that HWEvents models events
objectively, and InsertWaitCnt handles them as necessary for the task
it is trying to achieve (insert wait instructions).
---
 llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp   |  9 ++++---
 llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def   |  4 +--
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 29 +++++++++++++--------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index c472bac3babec..1087b08ac9d91 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -73,8 +73,9 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
   assert(SIInstrInfo::isVMEM(Inst));
   // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
   // these should use VM_CNT.
-  if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
-    return HWEvents::VMEM_ACCESS;
+  if (SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+    return HWEvents::VMEM_READ_ACCESS;
+
   if (Inst.mayStore() &&
       (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
     if (TII.mayAccessScratch(Inst))
@@ -82,7 +83,7 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
     return HWEvents::VMEM_WRITE_ACCESS;
   }
   if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
-    return HWEvents::VMEM_ACCESS;
+    return HWEvents::VMEM_READ_ACCESS;
 
   if (SIInstrInfo::isImage(Inst)) {
     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
@@ -99,7 +100,7 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
       return HWEvents::VMEM_SAMPLER_READ_ACCESS;
   }
 
-  return HWEvents::VMEM_ACCESS;
+  return HWEvents::VMEM_READ_ACCESS;
 }
 
 static HWEvents getEventsForImpl(const MachineInstr &Inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
index 9d2d01cee6e02..bd12af16d66cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
@@ -19,9 +19,7 @@
 #define AMDGPU_LAST_HW_EVENT(X)
 #endif
 
-// TODO: VMEM_ACCESS should be broken up and be target-independent, not interpreted differently
-// depending on the target.
-AMDGPU_HW_EVENT(VMEM_ACCESS,                0)  /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
+AMDGPU_HW_EVENT(VMEM_READ_ACCESS,           0)  /* vmem read */
 AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS,   1)  /* vmem SAMPLER read (gfx12+ only) */
 AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS,       2)  /* vmem BVH read (gfx12+ only) */
 AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS,          3)  /* GLOBAL_INV (gfx12+ only) */
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index db2609651440d..09d554eb64067 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -249,7 +249,7 @@ class WaitcntGenerator {
                                 const WaitcntBrackets &ScoreBrackets) = 0;
 
   // Returns the set of HWEvents that corresponds to counter \p T.
-  virtual const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
+  virtual HWEvents getWaitEvents(AMDGPU::InstCounterType T) const = 0;
 
   /// \returns the counter that corresponds to event \p E.
   AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
@@ -274,7 +274,7 @@ class WaitcntGenerator {
 class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
   static constexpr const HWEvents
       WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
-          HWEvents::VMEM_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
+          HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
               HWEvents::VMEM_BVH_READ_ACCESS,
           HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
               HWEvents::SQ_MESSAGE,
@@ -303,8 +303,11 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const override {
-    return WaitEventMaskForInstPreGFX12[T];
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
+    HWEvents EVs = WaitEventMaskForInstPreGFX12[T];
+    if (T == AMDGPU::LOAD_CNT && !ST.hasVscnt())
+      EVs |= WaitEventMaskForInstPreGFX12[AMDGPU::STORE_CNT];
+    return EVs;
   }
 
   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
@@ -315,7 +318,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
   bool IsExpertMode;
   static constexpr const HWEvents
       WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
-          HWEvents::VMEM_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
+          HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
           HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
           HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
               HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
@@ -352,7 +355,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const override {
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
     return WaitEventMaskForInstGFX12Plus[T];
   }
 
@@ -476,7 +479,7 @@ class SIInsertWaitcnts {
   bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
   void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          bool ExpertMode) const;
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const {
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const {
     return WCG->getWaitEvents(T);
   }
   AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
@@ -1539,10 +1542,14 @@ bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
       (T == AMDGPU::X_CNT && hasPendingEvent(HWEvents::SMEM_GROUP)))
     return true;
 
-  // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
-  // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
-  // out-of-order completion.
   if (T == AMDGPU::LOAD_CNT) {
+
+    // On targets without VScnt, LOAD_CNT includes all of STORE_CNT as well.
+    // All these events use one counter and do not go out of order with respect
+    // to each other.
+    if (!Context->ST.hasVscnt())
+      return false;
+
     HWEvents Events = PendingEvents & Context->getWaitEvents(T);
     // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
     // events
@@ -2279,7 +2286,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
     // no need to wait for it at function boundaries.
     if (ST.hasExtendedWaitCounts() &&
-        !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_ACCESS))
+        !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
       AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
     Wait = AllZeroWait;
     break;

>From 78fab203839738208e6797a9aa61a55322f26935 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 18 Jun 2026 13:56:10 +0200
Subject: [PATCH 2/2] Adjust comment

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 09d554eb64067..c261ae9c4b6c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1551,9 +1551,12 @@ bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
       return false;
 
     HWEvents Events = PendingEvents & Context->getWaitEvents(T);
-    // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
-    // events
+
+    // GLOBAL_INV completes in-order with other LOAD_CNT events,
+    // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
+    // events doesn't cause out-of-order completion.
     Events -= HWEvents::GLOBAL_INV_ACCESS;
+
     // Return true only if there are still multiple event types after removing
     // GLOBAL_INV
     return Events.size() > 1;