[llvm-branch-commits] [llvm] [AMDGPU][InsertWaitCnts] Make HWEvent a BitMask (PR #203864)

Mon Jun 15 02:50:30 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

<details>
<summary>Changes</summary>

Follow up from comments on https://github.com/llvm/llvm-project/pull/202886

Make HWEvent a bitmask by default instead of having both the enum and a separate HWEventSet. This has the advantage of streamlining the code a bit and opening up the possibility of adding "modifiers" to events; e.g., I imagine we could now fold "VMemType" into the events.
We already do this with things like SMEM_GROUP. At least now it's baked into the design.

I opted for a bit more verbosity by taking inspiration from FastMathFlags (FMF): instead of exposing a raw enum, I wrap it in a class with helper functions. The downside is having to reimplement all the little bitwise ops, but the result is a cleaner, simpler interface than a raw enum (class) with many free helper functions. I initially tried the raw-enum approach, but I recoiled at the sight of things like `contains(A, B)`, which isn't very clear, while `A.contains(B)` is self-explanatory.

Since HWEvent is a bitmask, I also implemented a simple iterator over all set bits of the mask. This is useful to have because some APIs in InsertWaitCnts handle one event at a time.

---

Patch is 31.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/203864.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp (+30-32) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def (+28-34) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h (+88-86) 
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+73-79) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 82be5059e7443..f1bf6a9ccf0b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -15,21 +15,9 @@
 
 namespace llvm {
 namespace AMDGPU {
-void HWEventSet::print(raw_ostream &OS) const {
-  ListSeparator LS(", ");
-  for (HWEvent Event : hw_events()) {
-    if (contains(Event))
-      OS << LS << toString(Event);
-  }
-}
-
-void HWEventSet::dump() const {
-  print(dbgs());
-  dbgs() << "\n";
-}
 
-static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
-                                               const SIInstrInfo &TII) {
+static HWEvent getExpertSchedulingEventType(const MachineInstr &Inst,
+                                            const SIInstrInfo &TII) {
   if (TII.isVALU(Inst)) {
     // Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
     // out-of-order with respect to each other, so each of these classes
@@ -61,7 +49,7 @@ static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
     return HWEvent::VGPR_VMEM_READ;
 
   // Otherwise, no hazard.
-  return {};
+  return HWEvent::NONE;
 }
 
 static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
@@ -110,13 +98,13 @@ static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
   return HWEvent::VMEM_ACCESS;
 }
 
-static HWEventSet getEventsForImpl(const MachineInstr &Inst,
-                                   const GCNSubtarget &ST,
-                                   const SIInstrInfo &TII) {
+static HWEvent getEventsForImpl(const MachineInstr &Inst,
+                                const GCNSubtarget &ST,
+                                const SIInstrInfo &TII) {
   if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
     if (TII.isAlwaysGDS(Inst.getOpcode()) ||
         TII.hasModifiersSet(Inst, AMDGPU::OpName::gds))
-      return {HWEvent::GDS_ACCESS, HWEvent::GDS_GPR_LOCK};
+      return HWEvent::GDS_ACCESS | HWEvent::GDS_GPR_LOCK;
 
     return HWEvent::LDS_ACCESS;
   }
@@ -126,16 +114,16 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
       return getVmemHWEvent(Inst, ST, TII);
 
     assert(Inst.mayLoadOrStore());
-    HWEventSet S;
+    HWEvent E = HWEvent::NONE;
     if (TII.mayAccessVMEMThroughFlat(Inst)) {
       if (ST.hasWaitXcnt())
-        S.insert(HWEvent::VMEM_GROUP);
-      S.insert(getVmemHWEvent(Inst, ST, TII));
+        E |= HWEvent::VMEM_GROUP;
+      E |= getVmemHWEvent(Inst, ST, TII);
     }
 
     if (TII.mayAccessLDSThroughFlat(Inst))
-      S.insert(HWEvent::LDS_ACCESS);
-    return S;
+      E |= HWEvent::LDS_ACCESS;
+    return E;
   }
 
   if (SIInstrInfo::isVMEM(Inst) &&
@@ -144,19 +132,19 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
     // BUFFER_WBL2 is included here because unlike invalidates, has to be
     // followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
     // completed.
-    HWEventSet S = {getVmemHWEvent(Inst, ST, TII)};
+    HWEvent E = getVmemHWEvent(Inst, ST, TII);
     if (ST.hasWaitXcnt())
-      S.insert(HWEvent::VMEM_GROUP);
+      E |= HWEvent::VMEM_GROUP;
     if (ST.vmemWriteNeedsExpWaitcnt() &&
         (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst)))
-      S.insert(HWEvent::VMW_GPR_LOCK);
+      E |= HWEvent::VMW_GPR_LOCK;
 
-    return S;
+    return E;
   }
 
   if (TII.isSMRD(Inst)) {
     if (ST.hasWaitXcnt())
-      return {HWEvent::SMEM_GROUP, HWEvent::SMEM_ACCESS};
+      return HWEvent::SMEM_GROUP | HWEvent::SMEM_ACCESS;
     return HWEvent::SMEM_ACCESS;
   }
 
@@ -190,11 +178,11 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
     return HWEvent::SMEM_ACCESS;
   }
 
-  return {};
+  return HWEvent::NONE;
 }
 
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
-                        bool IsExpertMode) {
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+                     bool IsExpertMode) {
   const SIInstrInfo &TII = *ST.getInstrInfo();
 
   if (IsExpertMode)
@@ -203,4 +191,14 @@ HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
   return getEventsForImpl(Inst, ST, TII);
 }
 } // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent Events) {
+  ListSeparator LS(" | ");
+#define AMDGPU_HW_EVENT(E, V)                                                  \
+  if (Events & AMDGPU::HWEvent::E)                                             \
+    OS << LS << #E << " ";
+#include "AMDGPUHWEvents.def"
+  return OS;
+}
+
 } // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
index e0db74e93021e..9d2d01cee6e02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
@@ -19,42 +19,37 @@
 #define AMDGPU_LAST_HW_EVENT(X)
 #endif
 
-#ifndef AMDGPU_FIRST_HW_EVENT
-#define AMDGPU_FIRST_HW_EVENT(X)
-#endif
-
 // TODO: VMEM_ACCESS should be broken up and be target-independent, not interpreted differently
 // depending on the target.
-AMDGPU_HW_EVENT(VMEM_ACCESS)              /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
-AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */
-AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS)        /* GLOBAL_INV (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */
-AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */
-AMDGPU_HW_EVENT(VMEM_GROUP)               /* vmem group */
-AMDGPU_HW_EVENT(LDS_ACCESS)               /* lds read & write */
-AMDGPU_HW_EVENT(GDS_ACCESS)               /* gds read & write */
-AMDGPU_HW_EVENT(SQ_MESSAGE)               /* send message */
-AMDGPU_HW_EVENT(SCC_WRITE)                /* write to SCC from barrier */
-AMDGPU_HW_EVENT(SMEM_ACCESS)              /* scalar-memory read & write */
-AMDGPU_HW_EVENT(SMEM_GROUP)               /* scalar-memory group */
-AMDGPU_HW_EVENT(EXP_GPR_LOCK)             /* export holding on its data src */
-AMDGPU_HW_EVENT(GDS_GPR_LOCK)             /* GDS holding on its data and addr src */
-AMDGPU_HW_EVENT(EXP_POS_ACCESS)           /* write to export position */
-AMDGPU_HW_EVENT(EXP_PARAM_ACCESS)         /* write to export parameter */
-AMDGPU_HW_EVENT(VMW_GPR_LOCK)             /* vmem write holding on its data src */
-AMDGPU_HW_EVENT(EXP_LDS_ACCESS)           /* read by ldsdir counting as export */
-AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE)        /* write VGPR dest in Core/Side-MACC VALU */
-AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE)        /* write VGPR dest in DPMACC VALU */
-AMDGPU_HW_EVENT(VGPR_TRANS_WRITE)         /* write VGPR dest in TRANS VALU */
-AMDGPU_HW_EVENT(VGPR_XDL_WRITE)           /* write VGPR dest in XDL VALU */
-AMDGPU_HW_EVENT(VGPR_LDS_READ)            /* read VGPR source in LDS */
-AMDGPU_HW_EVENT(VGPR_FLAT_READ)           /* read VGPR source in FLAT */
-AMDGPU_HW_EVENT(VGPR_VMEM_READ)           /* read VGPR source in other VMEM */
-AMDGPU_HW_EVENT(ASYNC_ACCESS)             /* access that uses ASYNC_CNT */
-AMDGPU_HW_EVENT(TENSOR_ACCESS)            /* access that uses TENSOR_CNT */
+AMDGPU_HW_EVENT(VMEM_ACCESS,                0)  /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
+AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS,   1)  /* vmem SAMPLER read (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS,       2)  /* vmem BVH read (gfx12+ only) */
+AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS,          3)  /* GLOBAL_INV (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS,          4)  /* vmem write that is not scratch */
+AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS,       5)  /* vmem write that may be scratch */
+AMDGPU_HW_EVENT(VMEM_GROUP,                 6)  /* vmem group */
+AMDGPU_HW_EVENT(LDS_ACCESS,                 7)  /* lds read & write */
+AMDGPU_HW_EVENT(GDS_ACCESS,                 8)  /* gds read & write */
+AMDGPU_HW_EVENT(SQ_MESSAGE,                 9)  /* send message */
+AMDGPU_HW_EVENT(SCC_WRITE,                  10) /* write to SCC from barrier */
+AMDGPU_HW_EVENT(SMEM_ACCESS,                11) /* scalar-memory read & write */
+AMDGPU_HW_EVENT(SMEM_GROUP,                 12) /* scalar-memory group */
+AMDGPU_HW_EVENT(EXP_GPR_LOCK,               13) /* export holding on its data src */
+AMDGPU_HW_EVENT(GDS_GPR_LOCK,               14) /* GDS holding on its data and addr src */
+AMDGPU_HW_EVENT(EXP_POS_ACCESS,             15) /* write to export position */
+AMDGPU_HW_EVENT(EXP_PARAM_ACCESS,           16) /* write to export parameter */
+AMDGPU_HW_EVENT(VMW_GPR_LOCK,               17) /* vmem write holding on its data src */
+AMDGPU_HW_EVENT(EXP_LDS_ACCESS,             18) /* read by ldsdir counting as export */
+AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE,          19) /* write VGPR dest in Core/Side-MACC VALU */
+AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE,          20) /* write VGPR dest in DPMACC VALU */
+AMDGPU_HW_EVENT(VGPR_TRANS_WRITE,           21) /* write VGPR dest in TRANS VALU */
+AMDGPU_HW_EVENT(VGPR_XDL_WRITE,             22) /* write VGPR dest in XDL VALU */
+AMDGPU_HW_EVENT(VGPR_LDS_READ,              23) /* read VGPR source in LDS */
+AMDGPU_HW_EVENT(VGPR_FLAT_READ,             24) /* read VGPR source in FLAT */
+AMDGPU_HW_EVENT(VGPR_VMEM_READ,             25) /* read VGPR source in other VMEM */
+AMDGPU_HW_EVENT(ASYNC_ACCESS,               26) /* access that uses ASYNC_CNT */
+AMDGPU_HW_EVENT(TENSOR_ACCESS,              27) /* access that uses TENSOR_CNT */
 
-AMDGPU_FIRST_HW_EVENT(VMEM_ACCESS)
 AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
 
 
@@ -62,4 +57,3 @@ AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
 
 #undef AMDGPU_HW_EVENT
 #undef AMDGPU_LAST_HW_EVENT
-#undef AMDGPU_FIRST_HW_EVENT
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
index 5fad398fa56b3..8a2c50211efed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
@@ -9,8 +9,12 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
 
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
 
 namespace llvm {
 class GCNSubtarget;
@@ -19,110 +23,108 @@ class raw_ostream;
 
 namespace AMDGPU {
 
-/// TODO: This should be a bitmask from the start instead of having this enum
-///       + \ref HWEventSet below.
-enum class HWEvent : unsigned char {
-#define AMDGPU_HW_EVENT(X) X,
-#define AMDGPU_FIRST_HW_EVENT(X) FIRST_WAIT_EVENT = X,
-#define AMDGPU_LAST_HW_EVENT(X) NUM_WAIT_EVENTS = X,
-#include "AMDGPUHWEvents.def"
-};
-
-} // namespace AMDGPU
+class HWEvent {
+public:
+  using value_type = uint32_t;
 
-template <> struct enum_iteration_traits<AMDGPU::HWEvent> {
-  static constexpr bool is_iterable = true; // NOLINT
-};
+  enum : value_type {
+    NONE = 0,
+#define AMDGPU_HW_EVENT(X, V) X = (1 << V),
+#define AMDGPU_LAST_HW_EVENT(X) HWEVENT_LAST_EVENT = X,
+#include "AMDGPUHWEvents.def"
 
-namespace AMDGPU {
+    ALL = ((HWEVENT_LAST_EVENT << 1) - 1)
+  };
 
-static constexpr StringLiteral toString(HWEvent Event) {
-  switch (Event) {
-#define AMDGPU_HW_EVENT(EVENT)                                                 \
-  case HWEvent::EVENT:                                                         \
-    return #EVENT;
-#include "AMDGPUHWEvents.def"
-  }
+  /// Iterates over the set bits of an HWEvent.
+  /// NOLINTNEXTLINE
+  class const_iterator
+      : public iterator_facade_base<const_iterator, std::forward_iterator_tag,
+                                    HWEvent> {
+    HWEvent::value_type Cur = 0;
 
-  return "";
-}
+  public:
+    const_iterator() = default;
+    const_iterator(HWEvent H) : Cur(H.value()) {}
 
-/// Return an iterator over all events between FIRST_WAIT_EVENT
-/// and \c MaxEvent (exclusive, default value yields an enumeration over
-/// all counters).
-// NOLINTNEXTLINE
-inline iota_range<HWEvent>
-hw_events(HWEvent MaxEvent = HWEvent::NUM_WAIT_EVENTS) {
-  return enum_seq(HWEvent::FIRST_WAIT_EVENT, MaxEvent);
-}
+    bool operator==(const const_iterator &Other) const {
+      return Cur == Other.Cur;
+    }
 
-class HWEventSet {
-  unsigned Mask = 0;
+    HWEvent operator*() const {
+      return Cur ? (Cur & (1 << countr_zero(Cur))) : HWEvent();
+    }
 
-public:
-  HWEventSet() = default;
-  constexpr HWEventSet(HWEvent Event) {
-    static_assert(static_cast<unsigned>(HWEvent::NUM_WAIT_EVENTS) <=
-                      sizeof(Mask) * 8,
-                  "Not enough bits in Mask for all the events");
-    Mask |= 1 << static_cast<unsigned>(Event);
-  }
-  constexpr HWEventSet(std::initializer_list<HWEvent> Events) {
-    for (auto &E : Events) {
-      Mask |= 1 << static_cast<unsigned>(E);
+    const_iterator &operator++() {
+      Cur &= maskTrailingZeros<HWEvent::value_type>(countr_zero(Cur) + 1);
+      return *this;
     }
+  };
+
+  constexpr HWEvent() = default;
+  constexpr HWEvent(value_type V) : Data(V) {
+    assert((V & ALL) == V && "Bits set out of bounds!");
   }
-  void insert(const HWEvent &Event) {
-    Mask |= 1 << static_cast<unsigned>(Event);
-  }
-  void remove(const HWEvent &Event) {
-    Mask &= ~(1 << static_cast<unsigned>(Event));
-  }
-  void remove(const HWEventSet &Other) { Mask &= ~Other.Mask; }
-  bool contains(const HWEvent &Event) const {
-    return Mask & (1 << static_cast<unsigned>(Event));
-  }
-  /// \returns true if this set contains all elements of \p Other.
-  bool contains(const HWEventSet &Other) const {
-    return (~Mask & Other.Mask) == 0;
+
+  constexpr unsigned size() const { return popcount(Data); }
+  constexpr bool any() const { return Data != 0; }
+  constexpr bool none() const { return Data == 0; }
+  constexpr value_type value() const { return Data; }
+
+  constexpr explicit operator bool() const { return any(); }
+
+  constexpr bool contains(HWEvent Other) const {
+    return (~Data & Other.Data) == 0;
   }
-  /// \returns the intersection of this and \p Other.
-  HWEventSet operator&(const HWEventSet &Other) const {
-    auto Copy = *this;
-    Copy.Mask &= Other.Mask;
-    return Copy;
+
+  const_iterator begin() const { return *this; }
+
+  const_iterator end() const { return {}; }
+
+  constexpr HWEvent operator|(HWEvent Other) const { return Data | Other.Data; }
+  constexpr HWEvent operator&(HWEvent Other) const { return Data & Other.Data; }
+  constexpr HWEvent operator^(HWEvent Other) const { return Data ^ Other.Data; }
+
+  constexpr HWEvent operator~() const { return Data ^ ALL; }
+
+  constexpr bool operator==(HWEvent Other) const { return Data == Other.Data; }
+  constexpr bool operator!=(HWEvent Other) const { return Data != Other.Data; }
+
+  constexpr HWEvent &operator|=(HWEvent Other) {
+    Data |= Other.Data;
+    return *this;
   }
-  /// \returns the union of this and \p Other.
-  HWEventSet operator|(const HWEventSet &Other) const {
-    auto Copy = *this;
-    Copy.Mask |= Other.Mask;
-    return Copy;
+  constexpr HWEvent &operator&=(HWEvent Other) {
+    Data &= Other.Data;
+    return *this;
   }
-  /// This set becomes the union of this and \p Other.
-  HWEventSet &operator|=(const HWEventSet &Other) {
-    Mask |= Other.Mask;
+  constexpr HWEvent &operator^=(HWEvent Other) {
+    Data ^= Other.Data;
     return *this;
   }
-  /// This set becomes the intersection of this and \p Other.
-  HWEventSet &operator&=(const HWEventSet &Other) {
-    Mask &= Other.Mask;
+
+  /// Overload both bitwise AND operators w/ the value_type to avoid an implicit
+  /// conversion to HWEvent in this common pattern used to clear an event bit:
+  /// `Events & ~HWEvent::EVENT_TO_CLEAR`.
+  /// If we had the implicit conversion to HWEvent, we'd assert because
+  /// `~HWEvent::EVENT_TO_CLEAR` has bits set outside of `HWEvent::ALL`.
+  constexpr HWEvent operator&(value_type Other) const { return Data & Other; }
+  constexpr HWEvent &operator&=(value_type Other) {
+    Data &= Other;
     return *this;
   }
-  bool operator==(const HWEventSet &Other) const { return Mask == Other.Mask; }
-  bool operator!=(const HWEventSet &Other) const { return !(*this == Other); }
-  bool empty() const { return Mask == 0; }
-  /// \returns true if the set contains more than one element.
-  bool twoOrMore() const { return Mask & (Mask - 1); }
-  operator bool() const { return !empty(); }
-  void print(raw_ostream &OS) const;
-  LLVM_DUMP_METHOD void dump() const;
+
+private:
+  value_type Data = NONE;
 };
 
-/// \returns all HWEvents triggered by \p Inst
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
-                        bool IsExpertMode);
+/// \returns A bitmask of HWEvent triggered by \p Inst
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+                     bool IsExpertMode);
 
 } // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent E);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8d42715f70be6..fa6daaac43d71 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -44,7 +44,6 @@
 
 using namespace llvm;
 
-using HWEventSet = AMDGPU::HWEventSet;
 using HWEvent = AMDGPU::HWEvent;
 
 #define DEBUG_TYPE "si-insert-waitcnts"
@@ -249,13 +248,14 @@ class WaitcntGenerator {
                                 AMDGPU::Waitcnt Wait,
                                 const WaitcntBrackets &ScoreBrackets) = 0;
 
-  // Returns the HWEventSet that corresponds to counter \p T.
-  virtual const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
+  // Returns the set of HWEvents that corresponds to counter \p T.
+  virtual const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
 
   /// \returns the counter that corresponds to event \p E.
   AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
+    assert(E.size() == 1 && "Cannot handle a mask of events!");
     for (auto T : AMDGPU::inst_counter_types()) {
-      if (getWaitEvents(T).contains(E))
+      if (getWaitEvents(T) & E)
         return T;
     }
     llvm_unreachable("event type has no associated counter");
@@ -272,25 +272,24 @@ class WaitcntGenerator {
 };
 
 class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
-  static constexpr const HWEventSet
+  static constexpr const HWEvent
       WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
-          HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
-                      HWEvent::VMEM_BVH_READ_ACCESS}),
-          HWEventSet({HWEvent::SMEM_ACCESS, HWEvent::LDS_ACCESS,
-                      HWEvent::GDS_ACCESS, HWEvent::SQ_MESSAGE}),
-          HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
-                      HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
-                      HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
-          HWEventSet(
-              {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet(),
-          HWEventSet()};
+          HWEvent::VMEM_ACCESS | HWEvent::VMEM_SAMPLER_READ_ACCESS |
+              HWEvent::VMEM_BVH_READ_ACCESS,
+          HWEvent::SMEM_ACCESS | HWEvent::LDS_ACCESS | HWEvent::GDS_ACCESS |
+              HWEvent::SQ_MESSAGE,
+          HWEvent::EXP_GPR_LOCK | HWEvent::GDS_GPR_LOCK |
+              HWEvent::VMW_GPR_LOCK | HWEvent::EXP_PARAM_ACCESS |
+              HWEvent::EXP_POS_ACCESS | HWEvent::EXP_LDS_ACCESS,
+          HWEvent::VMEM_WRITE_ACCESS | HWEvent::SCRATCH_WRITE_ACCESS,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE,
+          HWEvent::NONE};
 
 public:
   using WaitcntGenerator::WaitcntGenerator;
@@ -304,7 +303,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
                         AMDGPU::Waitcnt Wait,
                         const WaitcntBrackets &ScoreBrackets) override;
 
-  co...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/203864