[llvm-branch-commits] [llvm] [AMDGPU][InsertWaitCnts] Make HWEvent a BitMask (PR #203864)
Pierre van Houtryve via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 15 02:49:17 PDT 2026
https://github.com/Pierre-vh created https://github.com/llvm/llvm-project/pull/203864
Follow up from comments on https://github.com/llvm/llvm-project/pull/202886
Make HWEvent a bitmask by default instead of having both the enum and a separate HWEventSet. This has the advantage of streamlining the code a bit and opening up the possibility of adding "modifiers" to events; e.g., I imagine we could now fold "VMemType" into the Events.
We already do this with things like SMEM_GROUP. At least now it's baked into the design.
I opted for a bit more verbosity by taking inspiration from FastMathFlags (FMF): instead of exposing a raw enum, I wrap it in a class with helper functions. The downside is having to reimplement all the little bitwise ops, but the result is a cleaner, simpler interface than a raw enum (class) with many helper functions. I initially tried that, but I recoiled at the sight of things like `contains(A, B)`, which isn't very clear, while `A.contains(B)` is self-explanatory.
Considering HWEvent is a bitmask, I also implemented a simple iterator to iterate over all set bits of the mask, which is useful because some APIs in SIInsertWaitcnts rely on handling one event at a time.
>From 9f4543ae9de4f7b6a5007b5ee728be0a38eb8fbd Mon Sep 17 00:00:00 2001
From: pvanhout <pierre.vanhoutryve at amd.com>
Date: Mon, 15 Jun 2026 11:48:57 +0200
Subject: [PATCH] [AMDGPU][InsertWaitCnts] Make HWEvent a BitMask
Follow up from comments on https://github.com/llvm/llvm-project/pull/202886
Make HWEvent a bitmask by default instead of having both the enum and a separate HWEventSet. This has the advantage of streamlining the code a bit and opening up the possibility of adding "modifiers" to events; e.g., I imagine we could now fold "VMemType" into the Events.
We already do this with things like SMEM_GROUP. At least now it's baked into the design.
I opted for a bit more verbosity by taking inspiration from FastMathFlags (FMF): instead of exposing a raw enum, I wrap it in a class with helper functions. The downside is having to reimplement all the little bitwise ops, but the result is a cleaner, simpler interface than a raw enum (class) with many helper functions. I initially tried that, but I recoiled at the sight of things like `contains(A, B)`, which isn't very clear, while `A.contains(B)` is self-explanatory.
Considering HWEvent is a bitmask, I also implemented a simple iterator to iterate over all set bits of the mask, which is useful because some APIs in SIInsertWaitcnts rely on handling one event at a time.
---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp | 62 ++++---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def | 62 ++++---
llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h | 174 ++++++++++----------
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 152 ++++++++---------
4 files changed, 219 insertions(+), 231 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
index 82be5059e7443..f1bf6a9ccf0b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.cpp
@@ -15,21 +15,9 @@
namespace llvm {
namespace AMDGPU {
-void HWEventSet::print(raw_ostream &OS) const {
- ListSeparator LS(", ");
- for (HWEvent Event : hw_events()) {
- if (contains(Event))
- OS << LS << toString(Event);
- }
-}
-
-void HWEventSet::dump() const {
- print(dbgs());
- dbgs() << "\n";
-}
-static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
- const SIInstrInfo &TII) {
+static HWEvent getExpertSchedulingEventType(const MachineInstr &Inst,
+ const SIInstrInfo &TII) {
if (TII.isVALU(Inst)) {
// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
// out-of-order with respect to each other, so each of these classes
@@ -61,7 +49,7 @@ static HWEventSet getExpertSchedulingEventType(const MachineInstr &Inst,
return HWEvent::VGPR_VMEM_READ;
// Otherwise, no hazard.
- return {};
+ return HWEvent::NONE;
}
static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
@@ -110,13 +98,13 @@ static HWEvent getVmemHWEvent(const MachineInstr &Inst, const GCNSubtarget &ST,
return HWEvent::VMEM_ACCESS;
}
-static HWEventSet getEventsForImpl(const MachineInstr &Inst,
- const GCNSubtarget &ST,
- const SIInstrInfo &TII) {
+static HWEvent getEventsForImpl(const MachineInstr &Inst,
+ const GCNSubtarget &ST,
+ const SIInstrInfo &TII) {
if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
if (TII.isAlwaysGDS(Inst.getOpcode()) ||
TII.hasModifiersSet(Inst, AMDGPU::OpName::gds))
- return {HWEvent::GDS_ACCESS, HWEvent::GDS_GPR_LOCK};
+ return HWEvent::GDS_ACCESS | HWEvent::GDS_GPR_LOCK;
return HWEvent::LDS_ACCESS;
}
@@ -126,16 +114,16 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
return getVmemHWEvent(Inst, ST, TII);
assert(Inst.mayLoadOrStore());
- HWEventSet S;
+ HWEvent E = HWEvent::NONE;
if (TII.mayAccessVMEMThroughFlat(Inst)) {
if (ST.hasWaitXcnt())
- S.insert(HWEvent::VMEM_GROUP);
- S.insert(getVmemHWEvent(Inst, ST, TII));
+ E |= HWEvent::VMEM_GROUP;
+ E |= getVmemHWEvent(Inst, ST, TII);
}
if (TII.mayAccessLDSThroughFlat(Inst))
- S.insert(HWEvent::LDS_ACCESS);
- return S;
+ E |= HWEvent::LDS_ACCESS;
+ return E;
}
if (SIInstrInfo::isVMEM(Inst) &&
@@ -144,19 +132,19 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
// BUFFER_WBL2 is included here because unlike invalidates, has to be
// followed "S_WAITCNT vmcnt(0)" is needed after to ensure the writeback has
// completed.
- HWEventSet S = {getVmemHWEvent(Inst, ST, TII)};
+ HWEvent E = getVmemHWEvent(Inst, ST, TII);
if (ST.hasWaitXcnt())
- S.insert(HWEvent::VMEM_GROUP);
+ E |= HWEvent::VMEM_GROUP;
if (ST.vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst)))
- S.insert(HWEvent::VMW_GPR_LOCK);
+ E |= HWEvent::VMW_GPR_LOCK;
- return S;
+ return E;
}
if (TII.isSMRD(Inst)) {
if (ST.hasWaitXcnt())
- return {HWEvent::SMEM_GROUP, HWEvent::SMEM_ACCESS};
+ return HWEvent::SMEM_GROUP | HWEvent::SMEM_ACCESS;
return HWEvent::SMEM_ACCESS;
}
@@ -190,11 +178,11 @@ static HWEventSet getEventsForImpl(const MachineInstr &Inst,
return HWEvent::SMEM_ACCESS;
}
- return {};
+ return HWEvent::NONE;
}
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
- bool IsExpertMode) {
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode) {
const SIInstrInfo &TII = *ST.getInstrInfo();
if (IsExpertMode)
@@ -203,4 +191,14 @@ HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
return getEventsForImpl(Inst, ST, TII);
}
} // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent Events) {
+ ListSeparator LS(" | "); // Set events print as "A | B | C"; nothing if none.
+#define AMDGPU_HW_EVENT(E, V) \
+ if (Events & AMDGPU::HWEvent::E) \
+ OS << LS << #E;
+#include "AMDGPUHWEvents.def"
+ return OS;
+}
+
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
index e0db74e93021e..9d2d01cee6e02 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.def
@@ -19,42 +19,37 @@
#define AMDGPU_LAST_HW_EVENT(X)
#endif
-#ifndef AMDGPU_FIRST_HW_EVENT
-#define AMDGPU_FIRST_HW_EVENT(X)
-#endif
-
// TODO: VMEM_ACCESS should be broken up and be target-independent, not interpreted differently
// depending on the target.
-AMDGPU_HW_EVENT(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
-AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */
-AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS) /* GLOBAL_INV (gfx12+ only) */
-AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */
-AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */
-AMDGPU_HW_EVENT(VMEM_GROUP) /* vmem group */
-AMDGPU_HW_EVENT(LDS_ACCESS) /* lds read & write */
-AMDGPU_HW_EVENT(GDS_ACCESS) /* gds read & write */
-AMDGPU_HW_EVENT(SQ_MESSAGE) /* send message */
-AMDGPU_HW_EVENT(SCC_WRITE) /* write to SCC from barrier */
-AMDGPU_HW_EVENT(SMEM_ACCESS) /* scalar-memory read & write */
-AMDGPU_HW_EVENT(SMEM_GROUP) /* scalar-memory group */
-AMDGPU_HW_EVENT(EXP_GPR_LOCK) /* export holding on its data src */
-AMDGPU_HW_EVENT(GDS_GPR_LOCK) /* GDS holding on its data and addr src */
-AMDGPU_HW_EVENT(EXP_POS_ACCESS) /* write to export position */
-AMDGPU_HW_EVENT(EXP_PARAM_ACCESS) /* write to export parameter */
-AMDGPU_HW_EVENT(VMW_GPR_LOCK) /* vmem write holding on its data src */
-AMDGPU_HW_EVENT(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
-AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE) /* write VGPR dest in Core/Side-MACC VALU */
-AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE) /* write VGPR dest in DPMACC VALU */
-AMDGPU_HW_EVENT(VGPR_TRANS_WRITE) /* write VGPR dest in TRANS VALU */
-AMDGPU_HW_EVENT(VGPR_XDL_WRITE) /* write VGPR dest in XDL VALU */
-AMDGPU_HW_EVENT(VGPR_LDS_READ) /* read VGPR source in LDS */
-AMDGPU_HW_EVENT(VGPR_FLAT_READ) /* read VGPR source in FLAT */
-AMDGPU_HW_EVENT(VGPR_VMEM_READ) /* read VGPR source in other VMEM */
-AMDGPU_HW_EVENT(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
-AMDGPU_HW_EVENT(TENSOR_ACCESS) /* access that uses TENSOR_CNT */
+AMDGPU_HW_EVENT(VMEM_ACCESS, 0) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */
+AMDGPU_HW_EVENT(VMEM_SAMPLER_READ_ACCESS, 1) /* vmem SAMPLER read (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_BVH_READ_ACCESS, 2) /* vmem BVH read (gfx12+ only) */
+AMDGPU_HW_EVENT(GLOBAL_INV_ACCESS, 3) /* GLOBAL_INV (gfx12+ only) */
+AMDGPU_HW_EVENT(VMEM_WRITE_ACCESS, 4) /* vmem write that is not scratch */
+AMDGPU_HW_EVENT(SCRATCH_WRITE_ACCESS, 5) /* vmem write that may be scratch */
+AMDGPU_HW_EVENT(VMEM_GROUP, 6) /* vmem group */
+AMDGPU_HW_EVENT(LDS_ACCESS, 7) /* lds read & write */
+AMDGPU_HW_EVENT(GDS_ACCESS, 8) /* gds read & write */
+AMDGPU_HW_EVENT(SQ_MESSAGE, 9) /* send message */
+AMDGPU_HW_EVENT(SCC_WRITE, 10) /* write to SCC from barrier */
+AMDGPU_HW_EVENT(SMEM_ACCESS, 11) /* scalar-memory read & write */
+AMDGPU_HW_EVENT(SMEM_GROUP, 12) /* scalar-memory group */
+AMDGPU_HW_EVENT(EXP_GPR_LOCK, 13) /* export holding on its data src */
+AMDGPU_HW_EVENT(GDS_GPR_LOCK, 14) /* GDS holding on its data and addr src */
+AMDGPU_HW_EVENT(EXP_POS_ACCESS, 15) /* write to export position */
+AMDGPU_HW_EVENT(EXP_PARAM_ACCESS, 16) /* write to export parameter */
+AMDGPU_HW_EVENT(VMW_GPR_LOCK, 17) /* vmem write holding on its data src */
+AMDGPU_HW_EVENT(EXP_LDS_ACCESS, 18) /* read by ldsdir counting as export */
+AMDGPU_HW_EVENT(VGPR_CSMACC_WRITE, 19) /* write VGPR dest in Core/Side-MACC VALU */
+AMDGPU_HW_EVENT(VGPR_DPMACC_WRITE, 20) /* write VGPR dest in DPMACC VALU */
+AMDGPU_HW_EVENT(VGPR_TRANS_WRITE, 21) /* write VGPR dest in TRANS VALU */
+AMDGPU_HW_EVENT(VGPR_XDL_WRITE, 22) /* write VGPR dest in XDL VALU */
+AMDGPU_HW_EVENT(VGPR_LDS_READ, 23) /* read VGPR source in LDS */
+AMDGPU_HW_EVENT(VGPR_FLAT_READ, 24) /* read VGPR source in FLAT */
+AMDGPU_HW_EVENT(VGPR_VMEM_READ, 25) /* read VGPR source in other VMEM */
+AMDGPU_HW_EVENT(ASYNC_ACCESS, 26) /* access that uses ASYNC_CNT */
+AMDGPU_HW_EVENT(TENSOR_ACCESS, 27) /* access that uses TENSOR_CNT */
-AMDGPU_FIRST_HW_EVENT(VMEM_ACCESS)
AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
@@ -62,4 +57,3 @@ AMDGPU_LAST_HW_EVENT(TENSOR_ACCESS)
#undef AMDGPU_HW_EVENT
#undef AMDGPU_LAST_HW_EVENT
-#undef AMDGPU_FIRST_HW_EVENT
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
index 5fad398fa56b3..8a2c50211efed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHWEvents.h
@@ -9,8 +9,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUHWEVENTS_H
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/bit.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
namespace llvm {
class GCNSubtarget;
@@ -19,110 +23,108 @@ class raw_ostream;
namespace AMDGPU {
-/// TODO: This should be a bitmask from the start instead of having this enum
-/// + \ref HWEventSet below.
-enum class HWEvent : unsigned char {
-#define AMDGPU_HW_EVENT(X) X,
-#define AMDGPU_FIRST_HW_EVENT(X) FIRST_WAIT_EVENT = X,
-#define AMDGPU_LAST_HW_EVENT(X) NUM_WAIT_EVENTS = X,
-#include "AMDGPUHWEvents.def"
-};
-
-} // namespace AMDGPU
+class HWEvent {
+public:
+ using value_type = uint32_t;
-template <> struct enum_iteration_traits<AMDGPU::HWEvent> {
- static constexpr bool is_iterable = true; // NOLINT
-};
+ enum : value_type {
+ NONE = 0,
+#define AMDGPU_HW_EVENT(X, V) X = (1 << V),
+#define AMDGPU_LAST_HW_EVENT(X) HWEVENT_LAST_EVENT = X,
+#include "AMDGPUHWEvents.def"
-namespace AMDGPU {
+ ALL = ((HWEVENT_LAST_EVENT << 1) - 1)
+ };
-static constexpr StringLiteral toString(HWEvent Event) {
- switch (Event) {
-#define AMDGPU_HW_EVENT(EVENT) \
- case HWEvent::EVENT: \
- return #EVENT;
-#include "AMDGPUHWEvents.def"
- }
+ /// Iterates over the set bits of an HWEvent.
+ /// NOLINTNEXTLINE
+ class const_iterator
+ : public iterator_facade_base<const_iterator, std::forward_iterator_tag,
+ HWEvent> {
+ HWEvent::value_type Cur = 0;
- return "";
-}
+ public:
+ const_iterator() = default;
+ const_iterator(HWEvent H) : Cur(H.value()) {}
-/// Return an iterator over all events between FIRST_WAIT_EVENT
-/// and \c MaxEvent (exclusive, default value yields an enumeration over
-/// all counters).
-// NOLINTNEXTLINE
-inline iota_range<HWEvent>
-hw_events(HWEvent MaxEvent = HWEvent::NUM_WAIT_EVENTS) {
- return enum_seq(HWEvent::FIRST_WAIT_EVENT, MaxEvent);
-}
+ bool operator==(const const_iterator &Other) const {
+ return Cur == Other.Cur;
+ }
-class HWEventSet {
- unsigned Mask = 0;
+ HWEvent operator*() const {
+ return Cur ? (Cur & (1 << countr_zero(Cur))) : HWEvent();
+ }
-public:
- HWEventSet() = default;
- constexpr HWEventSet(HWEvent Event) {
- static_assert(static_cast<unsigned>(HWEvent::NUM_WAIT_EVENTS) <=
- sizeof(Mask) * 8,
- "Not enough bits in Mask for all the events");
- Mask |= 1 << static_cast<unsigned>(Event);
- }
- constexpr HWEventSet(std::initializer_list<HWEvent> Events) {
- for (auto &E : Events) {
- Mask |= 1 << static_cast<unsigned>(E);
+ const_iterator &operator++() {
+ Cur &= maskTrailingZeros<HWEvent::value_type>(countr_zero(Cur) + 1);
+ return *this;
}
+ };
+
+ constexpr HWEvent() = default;
+ constexpr HWEvent(value_type V) : Data(V) {
+ assert((V & ALL) == V && "Bits set out of bounds!");
}
- void insert(const HWEvent &Event) {
- Mask |= 1 << static_cast<unsigned>(Event);
- }
- void remove(const HWEvent &Event) {
- Mask &= ~(1 << static_cast<unsigned>(Event));
- }
- void remove(const HWEventSet &Other) { Mask &= ~Other.Mask; }
- bool contains(const HWEvent &Event) const {
- return Mask & (1 << static_cast<unsigned>(Event));
- }
- /// \returns true if this set contains all elements of \p Other.
- bool contains(const HWEventSet &Other) const {
- return (~Mask & Other.Mask) == 0;
+
+ constexpr unsigned size() const { return popcount(Data); }
+ constexpr bool any() const { return Data != 0; }
+ constexpr bool none() const { return Data == 0; }
+ constexpr value_type value() const { return Data; }
+
+ constexpr explicit operator bool() const { return any(); }
+
+ constexpr bool contains(HWEvent Other) const {
+ return (~Data & Other.Data) == 0;
}
- /// \returns the intersection of this and \p Other.
- HWEventSet operator&(const HWEventSet &Other) const {
- auto Copy = *this;
- Copy.Mask &= Other.Mask;
- return Copy;
+
+ const_iterator begin() const { return *this; }
+
+ const_iterator end() const { return {}; }
+
+ constexpr HWEvent operator|(HWEvent Other) const { return Data | Other.Data; }
+ constexpr HWEvent operator&(HWEvent Other) const { return Data & Other.Data; }
+ constexpr HWEvent operator^(HWEvent Other) const { return Data ^ Other.Data; }
+
+ constexpr HWEvent operator~() const { return Data ^ ALL; }
+
+ constexpr bool operator==(HWEvent Other) const { return Data == Other.Data; }
+ constexpr bool operator!=(HWEvent Other) const { return Data != Other.Data; }
+
+ constexpr HWEvent &operator|=(HWEvent Other) {
+ Data |= Other.Data;
+ return *this;
}
- /// \returns the union of this and \p Other.
- HWEventSet operator|(const HWEventSet &Other) const {
- auto Copy = *this;
- Copy.Mask |= Other.Mask;
- return Copy;
+ constexpr HWEvent &operator&=(HWEvent Other) {
+ Data &= Other.Data;
+ return *this;
}
- /// This set becomes the union of this and \p Other.
- HWEventSet &operator|=(const HWEventSet &Other) {
- Mask |= Other.Mask;
+ constexpr HWEvent &operator^=(HWEvent Other) {
+ Data ^= Other.Data;
return *this;
}
- /// This set becomes the intersection of this and \p Other.
- HWEventSet &operator&=(const HWEventSet &Other) {
- Mask &= Other.Mask;
+
+ /// Overload both bitwise AND operators w/ the value_type to avoid an implicit
+ /// conversion to HWEvent in this common pattern used to clear an event bit:
+ /// `Events & ~HWEvent::EVENT_TO_CLEAR`.
+ /// If we had the implicit conversion to HWEvent, we'd assert because
+ /// `~HWEvent::EVENT_TO_CLEAR` has bits set outside of `HWEvent::ALL`.
+ constexpr HWEvent operator&(value_type Other) const { return Data & Other; }
+ constexpr HWEvent &operator&=(value_type Other) {
+ Data &= Other;
return *this;
}
- bool operator==(const HWEventSet &Other) const { return Mask == Other.Mask; }
- bool operator!=(const HWEventSet &Other) const { return !(*this == Other); }
- bool empty() const { return Mask == 0; }
- /// \returns true if the set contains more than one element.
- bool twoOrMore() const { return Mask & (Mask - 1); }
- operator bool() const { return !empty(); }
- void print(raw_ostream &OS) const;
- LLVM_DUMP_METHOD void dump() const;
+
+private:
+ value_type Data = NONE;
};
-/// \returns all HWEvents triggered by \p Inst
-HWEventSet getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
- bool IsExpertMode);
+/// \returns A bitmask of HWEvent triggered by \p Inst
+HWEvent getEventsFor(const MachineInstr &Inst, const GCNSubtarget &ST,
+ bool IsExpertMode);
} // namespace AMDGPU
+
+raw_ostream &operator<<(raw_ostream &OS, AMDGPU::HWEvent E);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8d42715f70be6..fa6daaac43d71 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -44,7 +44,6 @@
using namespace llvm;
-using HWEventSet = AMDGPU::HWEventSet;
using HWEvent = AMDGPU::HWEvent;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -249,13 +248,14 @@ class WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) = 0;
- // Returns the HWEventSet that corresponds to counter \p T.
- virtual const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
+ // Returns the set of HWEvents that corresponds to counter \p T.
+ virtual const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const = 0;
/// \returns the counter that corresponds to event \p E.
AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
+ assert(E.size() == 1 && "Cannot handle a mask of events!");
for (auto T : AMDGPU::inst_counter_types()) {
- if (getWaitEvents(T).contains(E))
+ if (getWaitEvents(T) & E)
return T;
}
llvm_unreachable("event type has no associated counter");
@@ -272,25 +272,24 @@ class WaitcntGenerator {
};
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
- static constexpr const HWEventSet
+ static constexpr const HWEvent
WaitEventMaskForInstPreGFX12[AMDGPU::NUM_INST_CNTS] = {
- HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::VMEM_SAMPLER_READ_ACCESS,
- HWEvent::VMEM_BVH_READ_ACCESS}),
- HWEventSet({HWEvent::SMEM_ACCESS, HWEvent::LDS_ACCESS,
- HWEvent::GDS_ACCESS, HWEvent::SQ_MESSAGE}),
- HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
- HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
- HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
- HWEventSet(
- {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet(),
- HWEventSet()};
+ HWEvent::VMEM_ACCESS | HWEvent::VMEM_SAMPLER_READ_ACCESS |
+ HWEvent::VMEM_BVH_READ_ACCESS,
+ HWEvent::SMEM_ACCESS | HWEvent::LDS_ACCESS | HWEvent::GDS_ACCESS |
+ HWEvent::SQ_MESSAGE,
+ HWEvent::EXP_GPR_LOCK | HWEvent::GDS_GPR_LOCK |
+ HWEvent::VMW_GPR_LOCK | HWEvent::EXP_PARAM_ACCESS |
+ HWEvent::EXP_POS_ACCESS | HWEvent::EXP_LDS_ACCESS,
+ HWEvent::VMEM_WRITE_ACCESS | HWEvent::SCRATCH_WRITE_ACCESS,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE,
+ HWEvent::NONE};
public:
using WaitcntGenerator::WaitcntGenerator;
@@ -304,7 +303,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
+ const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const override {
return WaitEventMaskForInstPreGFX12[T];
}
@@ -314,26 +313,26 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
bool IsExpertMode;
- static constexpr const HWEventSet
+ static constexpr const HWEvent
WaitEventMaskForInstGFX12Plus[AMDGPU::NUM_INST_CNTS] = {
- HWEventSet({HWEvent::VMEM_ACCESS, HWEvent::GLOBAL_INV_ACCESS}),
- HWEventSet({HWEvent::LDS_ACCESS, HWEvent::GDS_ACCESS}),
- HWEventSet({HWEvent::EXP_GPR_LOCK, HWEvent::GDS_GPR_LOCK,
- HWEvent::VMW_GPR_LOCK, HWEvent::EXP_PARAM_ACCESS,
- HWEvent::EXP_POS_ACCESS, HWEvent::EXP_LDS_ACCESS}),
- HWEventSet(
- {HWEvent::VMEM_WRITE_ACCESS, HWEvent::SCRATCH_WRITE_ACCESS}),
- HWEventSet({HWEvent::VMEM_SAMPLER_READ_ACCESS}),
- HWEventSet({HWEvent::VMEM_BVH_READ_ACCESS}),
- HWEventSet(
- {HWEvent::SMEM_ACCESS, HWEvent::SQ_MESSAGE, HWEvent::SCC_WRITE}),
- HWEventSet({HWEvent::VMEM_GROUP, HWEvent::SMEM_GROUP}),
- HWEventSet({HWEvent::ASYNC_ACCESS}),
- HWEventSet({HWEvent::TENSOR_ACCESS}),
- HWEventSet({HWEvent::VGPR_CSMACC_WRITE, HWEvent::VGPR_DPMACC_WRITE,
- HWEvent::VGPR_TRANS_WRITE, HWEvent::VGPR_XDL_WRITE}),
- HWEventSet({HWEvent::VGPR_LDS_READ, HWEvent::VGPR_FLAT_READ,
- HWEvent::VGPR_VMEM_READ})};
+ HWEvent::VMEM_ACCESS | HWEvent::GLOBAL_INV_ACCESS,
+ HWEvent::LDS_ACCESS | HWEvent::GDS_ACCESS,
+ HWEvent::EXP_GPR_LOCK | HWEvent::GDS_GPR_LOCK |
+ HWEvent::VMW_GPR_LOCK | HWEvent::EXP_PARAM_ACCESS |
+ HWEvent::EXP_POS_ACCESS | HWEvent::EXP_LDS_ACCESS,
+
+ HWEvent::VMEM_WRITE_ACCESS | HWEvent::SCRATCH_WRITE_ACCESS,
+ HWEvent::VMEM_SAMPLER_READ_ACCESS,
+ HWEvent::VMEM_BVH_READ_ACCESS,
+
+ HWEvent::SMEM_ACCESS | HWEvent::SQ_MESSAGE | HWEvent::SCC_WRITE,
+ HWEvent::VMEM_GROUP | HWEvent::SMEM_GROUP,
+ HWEvent::ASYNC_ACCESS,
+ HWEvent::TENSOR_ACCESS,
+ HWEvent::VGPR_CSMACC_WRITE | HWEvent::VGPR_DPMACC_WRITE |
+ HWEvent::VGPR_TRANS_WRITE | HWEvent::VGPR_XDL_WRITE,
+ HWEvent::VGPR_LDS_READ | HWEvent::VGPR_FLAT_READ |
+ HWEvent::VGPR_VMEM_READ};
public:
WaitcntGeneratorGFX12Plus() = delete;
@@ -353,7 +352,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const override {
+ const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const override {
return WaitEventMaskForInstGFX12Plus[T];
}
@@ -477,7 +476,7 @@ class SIInsertWaitcnts {
bool removeRedundantSoftXcnts(MachineBasicBlock &Block);
void setSchedulingMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
bool ExpertMode) const;
- const HWEventSet &getWaitEvents(AMDGPU::InstCounterType T) const {
+ const HWEvent &getWaitEvents(AMDGPU::InstCounterType T) const {
return WCG->getWaitEvents(T);
}
AMDGPU::InstCounterType getCounterFromEvent(HWEvent E) const {
@@ -593,19 +592,20 @@ class WaitcntBrackets {
void updateByEvent(HWEvent E, MachineInstr &MI);
void recordAsyncMark(MachineInstr &MI);
- bool hasPendingEvent() const { return !PendingEvents.empty(); }
+ HWEvent getPendingEvents() const { return PendingEvents; }
+ bool hasPendingEvent() const { return PendingEvents.any(); }
bool hasPendingEvent(HWEvent E) const { return PendingEvents.contains(E); }
bool hasPendingEvent(AMDGPU::InstCounterType T) const {
- bool HasPending = PendingEvents & Context->getWaitEvents(T);
+ bool HasPending = (PendingEvents & Context->getWaitEvents(T)).any();
assert(HasPending == !empty(T) &&
"Expected pending events iff scoreboard is not empty");
return HasPending;
}
bool hasMixedPendingEvents(AMDGPU::InstCounterType T) const {
- HWEventSet Events = PendingEvents & Context->getWaitEvents(T);
+ HWEvent Events = PendingEvents & Context->getWaitEvents(T);
// Return true if more than one bit is set in Events.
- return Events.twoOrMore();
+ return Events.size() > 1;
}
bool hasPendingFlat() const {
@@ -746,7 +746,7 @@ class WaitcntBrackets {
unsigned ScoreLBs[AMDGPU::NUM_INST_CNTS] = {0};
unsigned ScoreUBs[AMDGPU::NUM_INST_CNTS] = {0};
- HWEventSet PendingEvents;
+ HWEvent PendingEvents = HWEvent::NONE;
// Remember the last flat memory operation.
unsigned LastFlatDsCnt = 0;
unsigned LastFlatLoadCnt = 0;
@@ -885,6 +885,7 @@ bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
}
void WaitcntBrackets::updateByEvent(HWEvent E, MachineInstr &Inst) {
+ assert(E.size() == 1 && "Expected singular event!");
AMDGPU::InstCounterType T = Context->getCounterFromEvent(E);
assert(T < Context->MaxCounter);
@@ -902,7 +903,7 @@ void WaitcntBrackets::updateByEvent(HWEvent E, MachineInstr &Inst) {
// PendingEvents and ScoreUB need to be update regardless if this event
// changes the score of a register or not.
// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
- PendingEvents.insert(E);
+ PendingEvents |= E;
setScoreUB(T, CurrScore);
const SIRegisterInfo &TRI = Context->TRI;
@@ -983,13 +984,13 @@ void WaitcntBrackets::updateByEvent(HWEvent E, MachineInstr &Inst) {
} else if (T == AMDGPU::X_CNT) {
HWEvent OtherEvent =
E == HWEvent::SMEM_GROUP ? HWEvent::VMEM_GROUP : HWEvent::SMEM_GROUP;
- if (PendingEvents.contains(OtherEvent)) {
+ if (PendingEvents & OtherEvent) {
// Hardware inserts an implicit xcnt between interleaved
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
setScoreLB(T, getScoreUB(T) - 1);
- PendingEvents.remove(OtherEvent);
+ PendingEvents &= ~OtherEvent;
}
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(Op, T, CurrScore);
@@ -1197,12 +1198,7 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
OS << "Pending Events: ";
if (hasPendingEvent()) {
- ListSeparator LS;
- for (auto E : AMDGPU::hw_events()) {
- if (hasPendingEvent(E)) {
- OS << LS << AMDGPU::toString(E);
- }
- }
+ OS << getPendingEvents();
} else {
OS << "none";
}
@@ -1443,11 +1439,11 @@ MCPhysReg WaitcntBrackets::determineVGPR16Dependency(const MachineInstr &MI,
return Reg32;
// If hi/lo16 mixed events
- HWEventSet MIEvents =
+ HWEvent MIEvents =
AMDGPU::getEventsFor(MI, Context->ST, Context->IsExpertMode);
- HWEventSet OtherHalfEvents = Context->getWaitEvents(T);
- HWEventSet Events = MIEvents & OtherHalfEvents;
- if (Events.twoOrMore())
+ HWEvent OtherHalfEvents = Context->getWaitEvents(T);
+ HWEvent Events = MIEvents & OtherHalfEvents;
+ if (Events.size() > 1)
return Reg32;
return Reg;
}
@@ -1482,14 +1478,14 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
if (PendingSCCWrite &&
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
- HWEventSet SCC_WRITE_PendingEvent(HWEvent::SCC_WRITE);
+ HWEvent SCC_WRITE_PendingEvent = HWEvent::SCC_WRITE;
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
if ((PendingEvents & Context->getWaitEvents(AMDGPU::KM_CNT)) ==
SCC_WRITE_PendingEvent) {
setScoreLB(AMDGPU::KM_CNT, getScoreUB(AMDGPU::KM_CNT));
}
- PendingEvents.remove(SCC_WRITE_PendingEvent);
+ PendingEvents &= ~SCC_WRITE_PendingEvent;
PendingSCCWrite = nullptr;
}
}
@@ -1509,7 +1505,7 @@ void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents.remove(Context->getWaitEvents(T));
+ PendingEvents &= ~Context->getWaitEvents(T);
}
if (T == AMDGPU::KM_CNT && Count == 0 &&
@@ -1517,14 +1513,14 @@ void WaitcntBrackets::applyWaitcnt(AMDGPU::InstCounterType T, unsigned Count) {
if (!hasMixedPendingEvents(AMDGPU::X_CNT))
applyWaitcnt(AMDGPU::X_CNT, 0);
else
- PendingEvents.remove(HWEvent::SMEM_GROUP);
+ PendingEvents &= ~HWEvent::SMEM_GROUP;
}
if (T == AMDGPU::LOAD_CNT && hasPendingEvent(HWEvent::VMEM_GROUP) &&
!hasPendingEvent(AMDGPU::STORE_CNT)) {
if (!hasMixedPendingEvents(AMDGPU::X_CNT))
applyWaitcnt(AMDGPU::X_CNT, Count);
else if (Count == 0)
- PendingEvents.remove(HWEvent::VMEM_GROUP);
+ PendingEvents &= ~HWEvent::VMEM_GROUP;
}
}
@@ -1547,13 +1543,13 @@ bool WaitcntBrackets::counterOutOfOrder(AMDGPU::InstCounterType T) const {
// so having GLOBAL_INV_ACCESS mixed with other LOAD_CNT events doesn't cause
// out-of-order completion.
if (T == AMDGPU::LOAD_CNT) {
- HWEventSet Events = PendingEvents & Context->getWaitEvents(T);
+ HWEvent Events = PendingEvents & Context->getWaitEvents(T);
// Remove GLOBAL_INV_ACCESS from the event mask before checking for mixed
// events
- Events.remove(HWEvent::GLOBAL_INV_ACCESS);
+ Events &= ~HWEvent::GLOBAL_INV_ACCESS;
// Return true only if there are still multiple event types after removing
// GLOBAL_INV
- return Events.twoOrMore();
+ return Events.size() > 1;
}
return hasMixedPendingEvents(T);
@@ -2653,11 +2649,9 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
- HWEventSet InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
- for (HWEvent E : AMDGPU::hw_events()) {
- if (InstEvents.contains(E))
- ScoreBrackets->updateByEvent(E, Inst);
- }
+ HWEvent InstEvents = AMDGPU::getEventsFor(Inst, ST, IsExpertMode);
+ for (HWEvent E : InstEvents)
+ ScoreBrackets->updateByEvent(E, Inst);
if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
if (TII.isAlwaysGDS(Inst.getOpcode()) ||
@@ -2794,9 +2788,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
- const HWEventSet &EventsForT = Context->getWaitEvents(T);
- const HWEventSet OldEvents = PendingEvents & EventsForT;
- const HWEventSet OtherEvents = Other.PendingEvents & EventsForT;
+ const HWEvent &EventsForT = Context->getWaitEvents(T);
+ const HWEvent OldEvents = PendingEvents & EventsForT;
+ const HWEvent OtherEvents = Other.PendingEvents & EventsForT;
if (!OldEvents.contains(OtherEvents))
StrictDom = true;
PendingEvents |= OtherEvents;
@@ -2827,7 +2821,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == AMDGPU::KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(HWEvent::SCC_WRITE)) {
- if (!OldEvents.contains(HWEvent::SCC_WRITE)) {
+ if (!(OldEvents & HWEvent::SCC_WRITE)) {
PendingSCCWrite = Other.PendingSCCWrite;
} else if (PendingSCCWrite != Other.PendingSCCWrite) {
PendingSCCWrite = nullptr;
More information about the llvm-branch-commits
mailing list