[llvm] [AMDGPU][SIInsertWaitcnts][NFC] Introduce WaitEventSet container for events (PR #178511)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 29 09:24:07 PST 2026


https://github.com/vporpo updated https://github.com/llvm/llvm-project/pull/178511

>From 121ce91d193da51724dd35af9a49c0353a2bb707 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Sat, 24 Jan 2026 17:01:06 +0000
Subject: [PATCH 1/2] [AMDGPU][SIInsertWaitcnts][NFC] Introduce WaitEventSet
 container for events

Before this patch, WaitEventType events were collected in unsigned
integers that were used as small bit vectors.

This patch introduces a WaitEventSet container class to replace the integer
bit vectors with a class that hides the implementation of common operations
like insertion, removal, union, intersection etc. from the user.

The WaitEventSet API matches that of a set rather than a vector because we
don't care about the order of its contents. Internally, though, it is still a
bit vector that uses an unsigned integer as its storage, just like the
original implementation.
---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 216 ++++++++++++++------
 1 file changed, 154 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee59a2e59d4a3..1427768711113 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -219,12 +219,31 @@ enum WaitEventType {
   NUM_WAIT_EVENTS
 };
 #undef AMDGPU_EVENT_ENUM
+} // namespace
+
+namespace llvm {
+template <> struct enum_iteration_traits<WaitEventType> {
+  static constexpr bool is_iterable = true;
+};
+} // namespace llvm
+
+namespace {
+
+/// Return a range over all events between VMEM_ACCESS (the first event)
+/// and \c MaxEvent (exclusive, default value yields an enumeration over
+/// all events).
+auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
+  return enum_seq(VMEM_ACCESS, MaxEvent);
+}
 
 #define AMDGPU_EVENT_NAME(Name) #Name,
 static constexpr StringLiteral WaitEventTypeName[] = {
   AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
 };
 #undef AMDGPU_EVENT_NAME
+static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
+  return WaitEventTypeName[Event]; // Name table is indexed by event value.
+}
 // clang-format on
 
 // Enumerate different types of result-returning VMEM operations. Although
@@ -321,10 +340,88 @@ unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
   return getCounterRef(Wait, T);
 }
 
+/// A small set of events.
+class WaitEventSet {
+  unsigned Mask = 0;
+
+public:
+  WaitEventSet() = default;
+  explicit constexpr WaitEventSet(WaitEventType Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask |= 1 << Event;
+  }
+  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
+    for (auto &E : Events) {
+      assert((size_t)E < sizeof(Mask) * 8 && "Not enough bits in mask!");
+      Mask |= 1 << E;
+    }
+  }
+  void insert(const WaitEventType &Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask |= 1 << Event;
+  }
+  void remove(const WaitEventType &Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask &= ~(1 << Event);
+  }
+  bool contains(const WaitEventType &Event) const {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    return Mask & (1 << Event);
+  }
+  /// \Returns the intersection of this and \p Other.
+  WaitEventSet operator&(const WaitEventSet &Other) const {
+    auto Copy = *this;
+    Copy.Mask &= Other.Mask;
+    return Copy;
+  }
+  /// \Returns the union of this and \p Other.
+  WaitEventSet operator|(const WaitEventSet &Other) const {
+    auto Copy = *this;
+    Copy.Mask |= Other.Mask;
+    return Copy;
+  }
+  /// \Returns the inverse of this set.
+  WaitEventSet operator~() const {
+    auto Copy = *this;
+    Copy.Mask = ~Copy.Mask;
+    return Copy;
+  }
+  /// This set becomes the union of this and \p Other.
+  WaitEventSet &operator|=(const WaitEventSet &Other) {
+    Mask |= Other.Mask;
+    return *this;
+  }
+  /// This set becomes the intersection of this and \p Other.
+  WaitEventSet &operator&=(const WaitEventSet &Other) {
+    Mask &= Other.Mask;
+    return *this;
+  }
+  bool operator==(const WaitEventSet &Other) const {
+    return Mask == Other.Mask;
+  }
+  bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
+  bool empty() const { return Mask == 0; }
+  /// \Returns true if the set contains more than one element.
+  bool twoOrMore() const { return Mask & (Mask - 1); }
+  operator bool() const { return !empty(); }
+  void print(raw_ostream &OS) const {
+    ListSeparator LS(", "); // Emits ", " before every name but the first.
+    for (WaitEventType Event : wait_events())
+      if (contains(Event)) // Only print the events that are in this set.
+        OS << LS << getWaitEventTypeName(Event);
+  }
+  LLVM_DUMP_METHOD void dump() const;
+};
+
+void WaitEventSet::dump() const {
+  print(dbgs());
+  dbgs() << "\n"; // Terminate the line; print() does not emit one.
+}
+
 // Mapping from event to counter according to the table masks.
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+InstCounterType eventCounter(const WaitEventSet *masks, WaitEventType E) {
   for (auto T : inst_counter_types()) {
-    if (masks[T] & (1 << E))
+    if (masks[T].contains(E))
       return T;
   }
   llvm_unreachable("event type has no associated counter");
@@ -394,40 +491,32 @@ class WaitcntGenerator {
                                 AMDGPU::Waitcnt Wait,
                                 const WaitcntBrackets &ScoreBrackets) = 0;
 
-  // Returns an array of bit masks which can be used to map values in
+  // Returns an array of WaitEventSets which can be used to map values in
   // WaitEventType to corresponding counter values in InstCounterType.
-  virtual const unsigned *getWaitEventMask() const = 0;
+  virtual const WaitEventSet *getWaitEventMask() const = 0;
 
   // Returns a new waitcnt with all counters except VScnt set to 0. If
   // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
   virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
 
   virtual ~WaitcntGenerator() = default;
-
-  // Create a mask value from the initializer list of wait event types.
-  static constexpr unsigned
-  eventMask(std::initializer_list<WaitEventType> Events) {
-    unsigned Mask = 0;
-    for (auto &E : Events)
-      Mask |= 1 << E;
-
-    return Mask;
-  }
 };
 
 class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
-  static constexpr const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] =
-      {eventMask({VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
-       eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
-       eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
-                  EXP_POS_ACCESS, EXP_LDS_ACCESS}),
-       eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
-       0,
-       0,
-       0,
-       0,
-       0,
-       0};
+  static constexpr const WaitEventSet
+      WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+          WaitEventSet(
+              {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
+          WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+          WaitEventSet(),
+          WaitEventSet(),
+          WaitEventSet(),
+          WaitEventSet(),
+          WaitEventSet(),
+          WaitEventSet()};
 
 public:
   using WaitcntGenerator::WaitcntGenerator;
@@ -441,7 +530,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const unsigned *getWaitEventMask() const override {
+  const WaitEventSet *getWaitEventMask() const override {
     assert(ST);
     return WaitEventMaskForInstPreGFX12;
   }
@@ -452,19 +541,20 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
 class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
 protected:
   bool IsExpertMode;
-  static constexpr const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] =
-      {eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
-       eventMask({LDS_ACCESS, GDS_ACCESS}),
-       eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
-                  EXP_POS_ACCESS, EXP_LDS_ACCESS}),
-       eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
-       eventMask({VMEM_SAMPLER_READ_ACCESS}),
-       eventMask({VMEM_BVH_READ_ACCESS}),
-       eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
-       eventMask({VMEM_GROUP, SMEM_GROUP}),
-       eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
-                  VGPR_XDL_WRITE}),
-       eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
+  static constexpr const WaitEventSet
+      WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+          WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
+          WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
+          WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+                        EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+          WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+          WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
+          WaitEventSet({VMEM_BVH_READ_ACCESS}),
+          WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
+          WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
+          WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
+                        VGPR_XDL_WRITE}),
+          WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
 
 public:
   WaitcntGeneratorGFX12Plus() = delete;
@@ -484,7 +574,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  const unsigned *getWaitEventMask() const override {
+  const WaitEventSet *getWaitEventMask() const override {
     assert(ST);
     return WaitEventMaskForInstGFX12Plus;
   }
@@ -652,7 +742,9 @@ class SIInsertWaitcnts {
                          bool ExpertMode) const;
   AtomicRMWState getAtomicRMWState(MachineInstr &MI,
                                    AtomicRMWState PrevState) const;
-  const unsigned *getWaitEventMask() const { return WCG->getWaitEventMask(); }
+  const WaitEventSet *getWaitEventMask() const {
+    return WCG->getWaitEventMask();
+  }
 };
 
 // This objects maintains the current score brackets of each wait counter, and
@@ -747,20 +839,21 @@ class WaitcntBrackets {
   void applyWaitcnt(InstCounterType T, unsigned Count);
   void updateByEvent(WaitEventType E, MachineInstr &MI);
 
-  unsigned hasPendingEvent() const { return PendingEvents; }
-  unsigned hasPendingEvent(WaitEventType E) const {
-    return PendingEvents & (1 << E);
+  bool hasPendingEvent() const { return !PendingEvents.empty(); }
+  bool hasPendingEvent(WaitEventType E) const {
+    return PendingEvents.contains(E);
   }
-  unsigned hasPendingEvent(InstCounterType T) const {
-    unsigned HasPending = PendingEvents & Context->getWaitEventMask()[T];
-    assert((HasPending != 0) == (getScoreRange(T) != 0));
+  bool hasPendingEvent(InstCounterType T) const {
+    bool HasPending = PendingEvents & Context->getWaitEventMask()[T];
+    assert(HasPending == (getScoreRange(T) != 0) &&
+           "Expected no pending events iff scoreboard is empty");
     return HasPending;
   }
 
   bool hasMixedPendingEvents(InstCounterType T) const {
-    unsigned Events = hasPendingEvent(T);
+    WaitEventSet Events = PendingEvents & Context->getWaitEventMask()[T];
     // Return true if more than one bit is set in Events.
-    return Events & (Events - 1);
+    return Events.twoOrMore();
   }
 
   bool hasPendingFlat() const {
@@ -897,7 +990,7 @@ class WaitcntBrackets {
 
   unsigned ScoreLBs[NUM_INST_CNTS] = {0};
   unsigned ScoreUBs[NUM_INST_CNTS] = {0};
-  unsigned PendingEvents = 0;
+  WaitEventSet PendingEvents;
   // Remember the last flat memory operation.
   unsigned LastFlat[NUM_INST_CNTS] = {0};
   // Remember the last GDS operation.
@@ -1015,7 +1108,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
   // PendingEvents and ScoreUB need to be update regardless if this event
   // changes the score of a register or not.
   // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
-  PendingEvents |= 1 << E;
+  PendingEvents.insert(E);
   setScoreUB(T, CurrScore);
 
   const SIRegisterInfo *TRI = Context->TRI;
@@ -1095,13 +1188,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
     }
   } else if (T == X_CNT) {
     WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
-    if (PendingEvents & (1 << OtherEvent)) {
+    if (PendingEvents.contains(OtherEvent)) {
       // Hardware inserts an implicit xcnt between interleaved
       // SMEM and VMEM operations. So there will never be
       // outstanding address translations for both SMEM and
       // VMEM at the same time.
       setScoreLB(T, getScoreUB(T) - 1);
-      PendingEvents &= ~(1 << OtherEvent);
+      PendingEvents.remove(OtherEvent);
     }
     for (const MachineOperand &Op : Inst.all_uses())
       setScoreByOperand(Op, T, CurrScore);
@@ -1402,7 +1495,7 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
   if (PendingSCCWrite &&
       PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
       PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
-    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+    WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
     // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
     if ((PendingEvents & Context->getWaitEventMask()[KM_CNT]) ==
         SCC_WRITE_PendingEvent) {
@@ -1444,14 +1537,14 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
     if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, 0);
     else
-      PendingEvents &= ~(1 << SMEM_GROUP);
+      PendingEvents.remove(SMEM_GROUP);
   }
   if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
       !hasPendingEvent(STORE_CNT)) {
     if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, Count);
     else if (Count == 0)
-      PendingEvents &= ~(1 << VMEM_GROUP);
+      PendingEvents.remove(VMEM_GROUP);
   }
 }
 
@@ -2784,9 +2877,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
 
   for (auto T : inst_counter_types(Context->MaxCounter)) {
     // Merge event flags for this counter
-    auto EventsForT = Context->getWaitEventMask()[T];
-    const unsigned OldEvents = PendingEvents & EventsForT;
-    const unsigned OtherEvents = Other.PendingEvents & EventsForT;
+    const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
+    const WaitEventSet OldEvents = PendingEvents & EventsForT;
+    const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
     if (OtherEvents & ~OldEvents)
       StrictDom = true;
     PendingEvents |= OtherEvents;
@@ -2814,8 +2907,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     if (T == KM_CNT) {
       StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
       if (Other.hasPendingEvent(SCC_WRITE)) {
-        unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
-        if (!OldEventsHasSCCWrite) {
+        if (!OldEvents.contains(SCC_WRITE)) {
           PendingSCCWrite = Other.PendingSCCWrite;
         } else if (PendingSCCWrite != Other.PendingSCCWrite) {
           PendingSCCWrite = nullptr;

>From d7c57e3caebd08541b8472087a32fd5afaa3b400 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Thu, 29 Jan 2026 16:57:36 +0000
Subject: [PATCH 2/2] fixup! [AMDGPU][SIInsertWaitcnts][NFC] Introduce
 WaitEventSet container for events

---
 llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 +++++++--------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1427768711113..e38b06b285a26 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -347,27 +347,23 @@ class WaitEventSet {
 public:
   WaitEventSet() = default;
   explicit constexpr WaitEventSet(WaitEventType Event) {
-    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
+                  "Not enough bits in Mask for all the events");
     Mask |= 1 << Event;
   }
   constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
     for (auto &E : Events) {
-      assert((size_t)E < sizeof(Mask) * 8 && "Not enough bits in mask!");
       Mask |= 1 << E;
     }
   }
-  void insert(const WaitEventType &Event) {
-    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
-    Mask |= 1 << Event;
-  }
-  void remove(const WaitEventType &Event) {
-    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
-    Mask &= ~(1 << Event);
-  }
+  void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
+  void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
+  void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
   bool contains(const WaitEventType &Event) const {
-    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
     return Mask & (1 << Event);
   }
+  /// \Returns true if this set contains any element that is not in \p Other.
+  bool contains(const WaitEventSet &Other) const { return Mask & ~Other.Mask; }
   /// \Returns the intersection of this and \p Other.
   WaitEventSet operator&(const WaitEventSet &Other) const {
     auto Copy = *this;
@@ -380,12 +376,6 @@ class WaitEventSet {
     Copy.Mask |= Other.Mask;
     return Copy;
   }
-  /// \Returns the inverse of this set.
-  WaitEventSet operator~() const {
-    auto Copy = *this;
-    Copy.Mask = ~Copy.Mask;
-    return Copy;
-  }
   /// This set becomes the union of this and \p Other.
   WaitEventSet &operator|=(const WaitEventSet &Other) {
     Mask |= Other.Mask;
@@ -1502,7 +1492,7 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
       setScoreLB(KM_CNT, getScoreUB(KM_CNT));
     }
 
-    PendingEvents &= ~SCC_WRITE_PendingEvent;
+    PendingEvents.remove(SCC_WRITE_PendingEvent);
     PendingSCCWrite = nullptr;
   }
 }
@@ -1530,7 +1520,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
     setScoreLB(T, std::max(getScoreLB(T), UB - Count));
   } else {
     setScoreLB(T, UB);
-    PendingEvents &= ~Context->getWaitEventMask()[T];
+    PendingEvents.remove(Context->getWaitEventMask()[T]);
   }
 
   if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
@@ -2880,7 +2870,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
     const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
     const WaitEventSet OldEvents = PendingEvents & EventsForT;
     const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
-    if (OtherEvents & ~OldEvents)
+    if (OtherEvents.contains(OldEvents))
       StrictDom = true;
     PendingEvents |= OtherEvents;
 



More information about the llvm-commits mailing list