[llvm-branch-commits] [llvm] [AMDGPU][HWEvents] Refactor VMEM_ACCESS as VMEM_READ_ACCESS (PR #204545)

Thu Jun 18 04:56:27 PDT 2026

https://github.com/Pierre-vh updated https://github.com/llvm/llvm-project/pull/204545

>From 09a3fff4bf2e9dc385bf1135e5cc576f1874fba5 Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 18 Jun 2026 12:04:20 +0200
Subject: [PATCH 1/2] [AMDGPU][HWEvents] Refactor VMEM_ACCESS as
 VMEM_READ_ACCESS

Instead of having an HWEvent that can be either a read or a write
depending on the target, keep the events as straightforward as
possible and let InsertWaitCnt interpret it. Rename VMEM_ACCESS
to VMEM_READ_ACCESS and set VMEM_WRITE_ACCESS & similar events
even if the target does not have a VSCnt.

I think this conceptually makes more sense.
This separates concerns better so that HWEvents models events
objectively, and InsertWaitCnt handles them as necessary for the task
it is trying to achieve (insert wait instructions).
---
 llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp   |  9 +++---
 llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def   |  4 +--
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 36 +++++++++++++--------
 3 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 04c4e045e231d..df15c8a82b5e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -69,8 +69,9 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
   assert(SIInstrInfo::isVMEM(Inst));
   // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
   // these should use VM_CNT.
-  if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
-    return HWEvents::VMEM_ACCESS;
+  if (SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+    return HWEvents::VMEM_READ_ACCESS;
+
   if (Inst.mayStore() &&
       (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
     if (TII.mayAccessScratch(Inst))
@@ -78,7 +79,7 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
     return HWEvents::VMEM_WRITE_ACCESS;
   }
   if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
-    return HWEvents::VMEM_ACCESS;
+    return HWEvents::VMEM_READ_ACCESS;
 
   if (SIInstrInfo::isImage(Inst)) {
     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
@@ -95,7 +96,7 @@ static HWEvents getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
       return HWEvents::VMEM_SAMPLER_READ_ACCESS;
   }
 
-  return HWEvents::VMEM_ACCESS;
+  return HWEvents::VMEM_READ_ACCESS;
 }
 
 static HWEvents getEventsForImpl(const MachineInstr &Inst,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
index 9d2d01cee6e02..bd12af16d66cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
@@ -19,9 +19,7 @@
 #define AMDGPU_LAST_HW_EVENT(X)
 #endif
 
-// TODO: VMEM_ACCESS should be broken up and be target-independent, not interpreted differently
-// depending on the target.
-AMDGPU_HW_EVENT(VMEM_ACCESS,                0)  /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
+AMDGPU_HW_EVENT(VMEM_READ_ACCESS,           0)  /* vmem read */
 AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS,   1)  /* vmem SAMPLER read (gfx12+ only) */
 AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS,       2)  /* vmem BVH read (gfx12+ only) */
 AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS,          3)  /* GLOBAL_INV (gfx12+ only) */
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 65089858a8fbf..63670fc77d751 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -249,7 +249,7 @@ class WaitcntGenerator {
                                 const WaitcntBrackets &ScoreBrackets) = 0;
 
   // Returns the set of HWEvents that corresponds to counter \p T.
-  virtual const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
+  virtual HWEvents getWaitEvents(AMDGPU::InstCounterType T) const = 0;
 
   /// \returns the counter that corresponds to event \p E.
   AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
@@ -274,7 +274,7 @@ class WaitcntGenerator {
 class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
   static constexpr const HWEvents
       WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
-          HWEvents::VMEM_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
+          HWEvents::VMEM_READ_ACCESS | HWEvents::VMEM_SAMPLER_READ_ACCESS |
               HWEvents::VMEM_BVH_READ_ACCESS,
           HWEvents::SMEM_ACCESS | HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS |
               HWEvents::SQ_MESSAGE,
@@ -303,8 +303,11 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const override {
-    return WaitEventMaskForInstPreGFX12[T];
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
+    HWEvents EVs = WaitEventMaskForInstPreGFX12[T];
+    if (T == AMDGPU::LOAD_CNT && !ST.hasVscnt())
+      EVs |= WaitEventMaskForInstPreGFX12[AMDGPU::STORE_CNT];
+    return EVs;
   }
 
   AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
@@ -315,7 +318,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
   bool IsExpertMode;
   static constexpr const HWEvents
       WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
-          HWEvents::VMEM_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
+          HWEvents::VMEM_READ_ACCESS | HWEvents::GLOBAL_INV_ACCESS,
           HWEvents::LDS_ACCESS | HWEvents::GDS_ACCESS,
           HWEvents::EXP_GPR_LOCK | HWEvents::GDS_GPR_LOCK |
               HWEvents::VMW_GPR_LOCK | HWEvents::EXP_PARAM_ACCESS |
@@ -352,7 +355,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const override {
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const override {
     return WaitEventMaskForInstGFX12Plus[T];
   }
 
@@ -476,7 +479,7 @@ class SIInsertWaitcnts {
   bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
   void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          bool ExpertMode) const;
-  const HWEvents &getWaitEvents(AMDGPU::InstCounterType T) const {
+  HWEvents getWaitEvents(AMDGPU::InstCounterType T) const {
     return WCG->getWaitEvents(T);
   }
   AMDGPU::InstCounterType getCounterFromEvent(HWEvents E) const {
@@ -1539,14 +1542,21 @@ bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
       (T == AMDGPU::X_CNT && hasPendingEvent(HWEvents::SMEM_GROUP)))
     return true;
 
-  // GLOBAL_INV completes in-order with other LOAD_CNT events (VMEM_ACCESS),
-  // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
-  // out-of-order completion.
   if (T == AMDGPU::LOAD_CNT) {
+
+    // On targets without VScnt, LOAD_CNT includes all of STORE_CNT as well.
+    // All these events use one counter and do not go out of order with respect
+    // to each other.
+    if (!Context->ST.hasVscnt())
+      return false;
+
     HWEvents Events = PendingEvents & Context->getWaitEvents(T);
-    // Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
-    // events
+
+    // GLOBAL_INV completes in-order with other LOAD_CNT events
+    // (VMEM_READ_ACCESS), so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
+    // events doesn't cause out-of-order completion.
     Events = Events.without(HWEvents::GLOBAL_INV_ACCESS);
+
     // Return true only if there are still multiple event types after removing
     // GLOBAL_INV
     return Events.size() > 1;
@@ -2279,7 +2289,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
     // GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
     // no need to wait for it at function boundaries.
     if (ST.hasExtendedWaitCounts() &&
-        !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_ACCESS))
+        !ScoreBrackets.hasPendingEvent(HWEvents::VMEM_READ_ACCESS))
       AllZeroWait.set(AMDGPU::LOAD_CNT, ~0u);
     Wait = AllZeroWait;
     break;

>From f6589a17b507ce0364db21a5d6cdba66d51a723e Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Thu, 18 Jun 2026 13:56:10 +0200
Subject: [PATCH 2/2] Adjust comment

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 63670fc77d751..e9de198c14b5f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1552,8 +1552,8 @@ bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
 
     HWEvents Events = PendingEvents & Context->getWaitEvents(T);
 
-    // GLOBAL_INV completes in-order with other LOAD_CNT events
-    // (VMEM_READ_ACCESS), so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
+    // GLOBAL_INV completes in-order with other LOAD_CNT events,
+    // so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT
     // events doesn't cause out-of-order completion.
     Events = Events.without(HWEvents::GLOBAL_INV_ACCESS);