[llvm-branch-commits] [llvm] [AMDGPU][InsertWaitCnts] Make HWEvent a BitMask (PR #203864)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 15 02:50:30 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
Follow up from comments on https://github.com/llvm/llvm-project/pull/202886
Make HWEvent a bitmask by default instead of having both the enum and a separate HWEventSet. This has the advantage of streamlining the code a bit and opening the possibility of adding "modifiers" to events, e.g. I imagine we could now fold "VMemType" into the Events.
We already do this with things like SMEM_GROUP. At least now it's baked into the design.
I opted for a bit more verbosity by taking inspiration from FastMathFlags (FMF): instead of exposing a raw enum, I wrap it in a class w/ helper functions. The downside is having to reimplement all the little bitwise ops, but the result is a cleaner, simpler interface than a raw enum (class) w/ many free helper functions. I initially tried the latter, but I recoiled at the sight of things like `contains(A, B)`, which isn't very clear, while `A.contains(B)` is self-explanatory.
Considering HWEvent is a bitmask, I also implemented a simple iterator to iterate over all set bits of the mask, which is a useful thing to have as some APIs in InsertWaitCnt rely on treating one event at a time.
---
Patch is 31.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/203864.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp (+30-32)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def (+28-34)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h (+88-86)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+73-79)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 82be5059e7443..f1bf6a9ccf0b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -15,21 +15,9 @@
namespace llvm {
namespace AMDGPU {
-void HWEventSet::print(raw_ostream &OS) const {
- ListSeparator LS(", ");
- for (HWEvent Event : hw_events()) {
- if (contains(Event))
- OS << LS << toString(Event);
- }
-}
-
-void HWEventSet::dump() const {
- print(dbgs());
- dbgs() << "\n";
-}
-static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
- const SIInstrInfo &TII) {
+static HWEvent getExpertSchedulingEventType(const MachineInstr &Inst,
+ const SIInstrInfo &TII) {
if (TII.isVALU(Inst)) {
// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
// out-of-order with respect to each other, so each of these classes
@@ -61,7 +49,7 @@ static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
return HWEvent::VGPR_VMEM_READ;
// Otherwise, no hazard.
- return {};
+ return HWEvent::NONE;
}
static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
@@ -110,13 +98,13 @@ static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
return HWEvent::VMEM_ACCESS;
}
-static HWEventSet getEventsForImpl(const MachineInstr &Inst,
- const GCNSubtarget &ST,
- const SIInstrInfo &TII) {
+static HWEvent getEventsForImpl(const MachineInstr &Inst,
+ const GCNSubtarget &ST,
+ const SIInstrInfo &TII) {
if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
if (TII.isAlwaysGDS(Inst.getOpcode()) ||
TII.hasModifiersSet(Inst, AMDGPU::OpName::gds))
- return {HWEvent::GDS_ACCESS, HWEvent::GDS_GPR_LOCK};
+ return HWEvent::GDS_ACCESS | HWEvent::GDS_GPR_LOCK;
return HWEvent::LDS_ACCESS;
}
@@ -126,16 +114,16 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
return getVmemHWEvent(Inst, ST, TII);
assert(Inst.mayLoadOrStore());
- HWEventSet S;
+ HWEvent E = HWEvent::NONE;
if (TII.mayAccessVMEMThroughFlat(Inst)) {
if (ST.hasWaitXcnt())
- S.insert(HWEvent::VMEM_GROUP);
- S.insert(getVmemHWEvent(Inst, ST, TII));
+ E |= HWEvent::VMEM_GROUP;
+ E |= getVmemHWEvent(Inst, ST, TII);
}
if (TII.mayAccessLDSThroughFlat(Inst))
- S.insert(HWEvent::LDS_ACCESS);
- return S;
+ E |= HWEvent::LDS_ACCESS;
+ return E;
}
if (SIInstrInfo::isVMEM(Inst) &&
@@ -144,19 +132,19 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
// BUFFER_WBL2 is included here because unlike invalidates, has to be
// followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
// completed.
- HWEventSet S = {getVmemHWEvent(Inst, ST, TII)};
+ HWEvent E = getVmemHWEvent(Inst, ST, TII);
if (ST.hasWaitXcnt())
- S.insert(HWEvent::VMEM_GROUP);
+ E |= HWEvent::VMEM_GROUP;
if (ST.vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst)))
- S.insert(HWEvent::VMW_GPR_LOCK);
+ E |= HWEvent::VMW_GPR_LOCK;
- return S;
+ return E;
}
if (TII.isSMRD(Inst)) {
if (ST.hasWaitXcnt())
- return {HWEvent::SMEM_GROUP, HWEvent::SMEM_ACCESS};
+ return HWEvent::SMEM_GROUP | HWEvent::SMEM_ACCESS;
return HWEvent::SMEM_ACCESS;
}
@@ -190,11 +178,11 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
return HWEvent::SMEM_ACCESS;
}
- return {};
+ return HWEvent::NONE;
}
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
- bool IsExpertMode) {
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode) {
const SIInstrInfo &TII = *ST.getInstrInfo();
if (IsExpertMode)
@@ -203,4 +191,14 @@ HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
return getEventsForImpl(Inst, ST, TII);
}
} // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent Events) {
+ ListSeparator LS(" | ");
+#define AMDGPU_HW_EVENT(E, V) \
+ if (Events & AMDGPU::HWEvent::E) \
+ OS << LS << #E << " ";
+#include "AMDGPUHWEvents.def"
+ return OS;
+}
+
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
index e0db74e93021e..9d2d01cee6e02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
@@ -19,42 +19,37 @@
#define AMDGPU_LAST_HW_EVENT(X)
#endif
-#ifndef AMDGPU_FIRST_HW_EVENT
-#define AMDGPU_FIRST_HW_EVENT(X)
-#endif
-
// TODO: VMEM_ACCESS should be broken up and be target-independent, not interpreted differently
// depending on the target.
-AMDGPU_HW_EVENT(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
-AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */
-AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */
-AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */
-AMDGPU_HW_EVENT(VMEM_GROUP) /* vmem group */
-AMDGPU_HW_EVENT(LDS_ACCESS) /* lds read & write */
-AMDGPU_HW_EVENT(GDS_ACCESS) /* gds read & write */
-AMDGPU_HW_EVENT(SQ_MESSAGE) /* send message */
-AMDGPU_HW_EVENT(SCC_WRITE) /* write to SCC from barrier */
-AMDGPU_HW_EVENT(SMEM_ACCESS) /* scalar-memory read & write */
-AMDGPU_HW_EVENT(SMEM_GROUP) /* scalar-memory group */
-AMDGPU_HW_EVENT(EXP_GPR_LOCK) /* export holding on its data src */
-AMDGPU_HW_EVENT(GDS_GPR_LOCK) /* GDS holding on its data and addr src */
-AMDGPU_HW_EVENT(EXP_POS_ACCESS) /* write to export position */
-AMDGPU_HW_EVENT(EXP_PARAM_ACCESS) /* write to export parameter */
-AMDGPU_HW_EVENT(VMW_GPR_LOCK) /* vmem write holding on its data src */
-AMDGPU_HW_EVENT(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
-AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */
-AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */
-AMDGPU_HW_EVENT(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */
-AMDGPU_HW_EVENT(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */
-AMDGPU_HW_EVENT(VGPR_LDS_READ) /* read VGPR source in LDS */
-AMDGPU_HW_EVENT(VGPR_FLAT_READ) /* read VGPR source in FLAT */
-AMDGPU_HW_EVENT(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
-AMDGPU_HW_EVENT(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
-AMDGPU_HW_EVENT(TENSOR_ACCESS) /* access that uses TENSOR_CNT */
+AMDGPU_HW_EVENT(VMEM_ACCESS, 0) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
+AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS, 1) /* vmem SAMPLER read (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS, 2) /* vmem BVH read (gfx12+ only) */
+AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS, 3) /* GLOBAL_INV (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS, 4) /* vmem write that is not scratch */
+AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS, 5) /* vmem write that may be scratch */
+AMDGPU_HW_EVENT(VMEM_GROUP, 6) /* vmem group */
+AMDGPU_HW_EVENT(LDS_ACCESS, 7) /* lds read & write */
+AMDGPU_HW_EVENT(GDS_ACCESS, 8) /* gds read & write */
+AMDGPU_HW_EVENT(SQ_MESSAGE, 9) /* send message */
+AMDGPU_HW_EVENT(SCC_WRITE, 10) /* write to SCC from barrier */
+AMDGPU_HW_EVENT(SMEM_ACCESS, 11) /* scalar-memory read & write */
+AMDGPU_HW_EVENT(SMEM_GROUP, 12) /* scalar-memory group */
+AMDGPU_HW_EVENT(EXP_GPR_LOCK, 13) /* export holding on its data src */
+AMDGPU_HW_EVENT(GDS_GPR_LOCK, 14) /* GDS holding on its data and addr src */
+AMDGPU_HW_EVENT(EXP_POS_ACCESS, 15) /* write to export position */
+AMDGPU_HW_EVENT(EXP_PARAM_ACCESS, 16) /* write to export parameter */
+AMDGPU_HW_EVENT(VMW_GPR_LOCK, 17) /* vmem write holding on its data src */
+AMDGPU_HW_EVENT(EXP_LDS_ACCESS, 18) /* read by ldsdir counting as export */
+AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE, 19) /* write VGPR dest in Core/Side-MACC VALU */
+AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE, 20) /* write VGPR dest in DPMACC VALU */
+AMDGPU_HW_EVENT(VGPR_TRANS_WRITE, 21) /* write VGPR dest in TRANS VALU */
+AMDGPU_HW_EVENT(VGPR_XDL_WRITE, 22) /* write VGPR dest in XDL VALU */
+AMDGPU_HW_EVENT(VGPR_LDS_READ, 23) /* read VGPR source in LDS */
+AMDGPU_HW_EVENT(VGPR_FLAT_READ, 24) /* read VGPR source in FLAT */
+AMDGPU_HW_EVENT(VGPR_VMEM_READ, 25) /* read VGPR source in other VMEM */
+AMDGPU_HW_EVENT(ASYNC_ACCESS, 26) /* access that uses ASYNC_CNT */
+AMDGPU_HW_EVENT(TENSOR_ACCESS, 27) /* access that uses TENSOR_CNT */
-AMDGPU_FIRST_HW_EVENT(VMEM_ACCESS)
AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
@@ -62,4 +57,3 @@ AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
#undef AMDGPU_HW_EVENT
#undef AMDGPU_LAST_HW_EVENT
-#undef AMDGPU_FIRST_HW_EVENT
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
index 5fad398fa56b3..8a2c50211efed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
@@ -9,8 +9,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
namespace llvm {
class GCNSubtarget;
@@ -19,110 +23,108 @@ class raw_ostream;
namespace AMDGPU {
-/// TODO: This should be a bitmask from the start instead of having this enum
-/// + \ref HWEventSet below.
-enum class HWEvent : unsigned char {
-#define AMDGPU_HW_EVENT(X) X,
-#define AMDGPU_FIRST_HW_EVENT(X) FIRST_WAIT_EVENT = X,
-#define AMDGPU_LAST_HW_EVENT(X) NUM_WAIT_EVENTS = X,
-#include "AMDGPUHWEvents.def"
-};
-
-} // namespace AMDGPU
+class HWEvent {
+public:
+ using value_type = uint32_t;
-template <> struct enum_iteration_traits<AMDGPU::HWEvent> {
- static constexpr bool is_iterable = true; // NOLINT
-};
+ enum : value_type {
+ NONE = 0,
+#define AMDGPU_HW_EVENT(X, V) X = (1 << V),
+#define AMDGPU_LAST_HW_EVENT(X) HWEVENT_LAST_EVENT = X,
+#include "AMDGPUHWEvents.def"
-namespace AMDGPU {
+ ALL = ((HWEVENT_LAST_EVENT << 1) - 1)
+ };
-static constexpr StringLiteral toString(HWEvent Event) {
- switch (Event) {
-#define AMDGPU_HW_EVENT(EVENT) \
- case HWEvent::EVENT: \
- return #EVENT;
-#include "AMDGPUHWEvents.def"
- }
+ /// Iterates over the set bits of an HWEvent.
+ /// NOLINTNEXTLINE
+ class const_iterator
+ : public iterator_facade_base<const_iterator, std::forward_iterator_tag,
+ HWEvent> {
+ HWEvent::value_type Cur = 0;
- return "";
-}
+ public:
+ const_iterator() = default;
+ const_iterator(HWEvent H) : Cur(H.value()) {}
-/// Return an iterator over all events between FIRST_WAIT_EVENT
-/// and \c MaxEvent (exclusive, default value yields an enumeration over
-/// all counters).
-// NOLINTNEXTLINE
-inline iota_range<HWEvent>
-hw_events(HWEvent MaxEvent = HWEvent::NUM_WAIT_EVENTS) {
- return enum_seq(HWEvent::FIRST_WAIT_EVENT, MaxEvent);
-}
+ bool operator==(const const_iterator &Other) const {
+ return Cur == Other.Cur;
+ }
-class HWEventSet {
- unsigned Mask = 0;
+ HWEvent operator*() const {
+ return Cur ? (Cur & (1 << countr_zero(Cur))) : HWEvent();
+ }
-public:
- HWEventSet() = default;
- constexpr HWEventSet(HWEvent Event) {
- static_assert(static_cast<unsigned>(HWEvent::NUM_WAIT_EVENTS) <=
- sizeof(Mask) * 8,
- "Not enough bits in Mask for all the events");
- Mask |= 1 << static_cast<unsigned>(Event);
- }
- constexpr HWEventSet(std::initializer_list<HWEvent> Events) {
- for (auto &E : Events) {
- Mask |= 1 << static_cast<unsigned>(E);
+ const_iterator &operator++() {
+ Cur &= maskTrailingZeros<HWEvent::value_type>(countr_zero(Cur) + 1);
+ return *this;
}
+ };
+
+ constexpr HWEvent() = default;
+ constexpr HWEvent(value_type V) : Data(V) {
+ assert((V & ALL) == V && "Bits set out of bounds!");
}
- void insert(const HWEvent &Event) {
- Mask |= 1 << static_cast<unsigned>(Event);
- }
- void remove(const HWEvent &Event) {
- Mask &= ~(1 << static_cast<unsigned>(Event));
- }
- void remove(const HWEventSet &Other) { Mask &= ~Other.Mask; }
- bool contains(const HWEvent &Event) const {
- return Mask & (1 << static_cast<unsigned>(Event));
- }
- /// \returns true if this set contains all elements of \p Other.
- bool contains(const HWEventSet &Other) const {
- return (~Mask & Other.Mask) == 0;
+
+ constexpr unsigned size() const { return popcount(Data); }
+ constexpr bool any() const { return Data != 0; }
+ constexpr bool none() const { return Data == 0; }
+ constexpr value_type value() const { return Data; }
+
+ constexpr explicit operator bool() const { return any(); }
+
+ constexpr bool contains(HWEvent Other) const {
+ return (~Data & Other.Data) == 0;
}
- /// \returns the intersection of this and \p Other.
- HWEventSet operator&(const HWEventSet &Other) const {
- auto Copy = *this;
- Copy.Mask &= Other.Mask;
- return Copy;
+
+ const_iterator begin() const { return *this; }
+
+ const_iterator end() const { return {}; }
+
+ constexpr HWEvent operator|(HWEvent Other) const { return Data | Other.Data; }
+ constexpr HWEvent operator&(HWEvent Other) const { return Data & Other.Data; }
+ constexpr HWEvent operator^(HWEvent Other) const { return Data ^ Other.Data; }
+
+ constexpr HWEvent operator~() const { return Data ^ ALL; }
+
+ constexpr bool operator==(HWEvent Other) const { return Data == Other.Data; }
+ constexpr bool operator!=(HWEvent Other) const { return Data != Other.Data; }
+
+ constexpr HWEvent &operator|=(HWEvent Other) {
+ Data |= Other.Data;
+ return *this;
}
- /// \returns the union of this and \p Other.
- HWEventSet operator|(const HWEventSet &Other) const {
- auto Copy = *this;
- Copy.Mask |= Other.Mask;
- return Copy;
+ constexpr HWEvent &operator&=(HWEvent Other) {
+ Data &= Other.Data;
+ return *this;
}
- /// This set becomes the union of this and \p Other.
- HWEventSet &operator|=(const HWEventSet &Other) {
- Mask |= Other.Mask;
+ constexpr HWEvent &operator^=(HWEvent Other) {
+ Data ^= Other.Data;
return *this;
}
- /// This set becomes the intersection of this and \p Other.
- HWEventSet &operator&=(const HWEventSet &Other) {
- Mask &= Other.Mask;
+
+ /// Overload both bitwise AND operators w/ the value_type to avoid an implicit
+ /// conversion to HWEvent in this common pattern used to clear an event bit:
+ /// `Events & ~HWEvent::EVENT_TO_CLEAR`.
+ /// If we had the implicit conversion to HWEvent, we'd assert because
+ /// `~HWEvent::EVENT_TO_CLEAR` has bits set outside of `HWEvent::ALL`.
+ constexpr HWEvent operator&(value_type Other) const { return Data & Other; }
+ constexpr HWEvent &operator&=(value_type Other) {
+ Data &= Other;
return *this;
}
- bool operator==(const HWEventSet &Other) const { return Mask == Other.Mask; }
- bool operator!=(const HWEventSet &Other) const { return !(*this == Other); }
- bool empty() const { return Mask == 0; }
- /// \returns true if the set contains more than one element.
- bool twoOrMore() const { return Mask & (Mask - 1); }
- operator bool() const { return !empty(); }
- void print(raw_ostream &OS) const;
- LLVM_DUMP_METHOD void dump() const;
+
+private:
+ value_type Data = NONE;
};
-/// \returns all HWEvents triggered by \p Inst
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
- bool IsExpertMode);
+/// \returns A bitmask of HWEvent triggered by \p Inst
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode);
} // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent E);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8d42715f70be6..fa6daaac43d71 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -44,7 +44,6 @@
using namespace llvm;
-using HWEventSet = AMDGPU::HWEventSet;
using HWEvent = AMDGPU::HWEvent;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -249,13 +248,14 @@ class WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) = 0;
- // Returns the HWEventSet that corresponds to counter \p T.
- virtual const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
+ // Returns the set of HWEvents that corresponds to counter \p T.
+ virtual const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
/// \returns the counter that corresponds to event \p E.
AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
+ assert(E.size() == 1 && "Cannot handle a mask of events!");
for (auto T : AMDGPU::inst_counter_types()) {
- if (getWaitEvents(T).contains(E))
+ if (getWaitEvents(T) & E)
return T;
}
llvm_unreachable("event type has no associated counter");
@@ -272,25 +272,24 @@ class WaitcntGenerator {
};
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
- static constexpr const HWEventSet
+ static constexpr const HWEvent
WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
- HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
- HWEvent::VMEM_BVH_READ_ACCESS}),
- HWEventSet({HWEvent::SMEM_ACCESS, HWEvent::LDS_ACCESS,
- HWEvent::GDS_ACCESS, HWEvent::SQ_MESSAGE}),
- HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
- HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
- HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
- HWEventSet(
- {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet()};
+ HWEvent::VMEM_ACCESS | HWEvent::VMEM_SAMPLER_READ_ACCESS |
+ HWEvent::VMEM_BVH_READ_ACCESS,
+ HWEvent::SMEM_ACCESS | HWEvent::LDS_ACCESS | HWEvent::GDS_ACCESS |
+ HWEvent::SQ_MESSAGE,
+ HWEvent::EXP_GPR_LOCK | HWEvent::GDS_GPR_LOCK |
+ HWEvent::VMW_GPR_LOCK | HWEvent::EXP_PARAM_ACCESS |
+ HWEvent::EXP_POS_ACCESS | HWEvent::EXP_LDS_ACCESS,
+ HWEvent::VMEM_WRITE_ACCESS | HWEvent::SCRATCH_WRITE_ACCESS,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE};
public:
using WaitcntGenerator::WaitcntGenerator;
@@ -304,7 +303,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- co...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/203864
More information about the llvm-branch-commits
mailing list