[llvm] [AMDGPU][SIInsertWaitcnts][NFC] Introduce WaitEventSet container for events (PR #178511)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 09:24:07 PST 2026
https://github.com/vporpo updated https://github.com/llvm/llvm-project/pull/178511
>From 121ce91d193da51724dd35af9a49c0353a2bb707 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Sat, 24 Jan 2026 17:01:06 +0000
Subject: [PATCH 1/2] [AMDGPU][SIInsertWaitcnts][NFC] Introduce WaitEventSet
container for events
Before this patch WaitEventType events used to be collected in unsigned
integers that were used as small bit vectors.
This patch introduces a WaitEventSet container class to replace the integer
bit vectors with a class that hides the implementation of common operations
like insertion, removal, union, intersection etc. from the user.
The WaitEventSet API matches that of a set and not a vector because we don't
care about the order of its contents. Internally though it is still a bit
vector that uses an unsigned integer as its storage, just like the original
implementation.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 216 ++++++++++++++------
1 file changed, 154 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ee59a2e59d4a3..1427768711113 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -219,12 +219,31 @@ enum WaitEventType {
NUM_WAIT_EVENTS
};
#undef AMDGPU_EVENT_ENUM
+} // namespace
+
+namespace llvm {
+/// Declares WaitEventType as an iterable enum. wait_events() below builds its
+/// range with enum_seq() over WaitEventType values; presumably enum_seq()
+/// requires this opt-in (see llvm/ADT/Sequence.h) -- TODO confirm.
+template <> struct enum_iteration_traits<WaitEventType> {
+ static constexpr bool is_iterable = true;
+};
+} // namespace llvm
+
+namespace {
+
+/// Return an iterable range over the wait events from VMEM_ACCESS (the first
+/// event) up to, but not including, \c MaxEvent. With the default argument
+/// (NUM_WAIT_EVENTS) the range enumerates all events.
+auto wait_events(WaitEventType MaxEvent = NUM_WAIT_EVENTS) {
+ return enum_seq(VMEM_ACCESS, MaxEvent);
+}
#define AMDGPU_EVENT_NAME(Name) #Name,
static constexpr StringLiteral WaitEventTypeName[] = {
AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
};
#undef AMDGPU_EVENT_NAME
+/// \Returns the printable name of \p Event, looked up in the WaitEventTypeName
+/// table. The lookup is an unchecked array index, so \p Event must be a valid
+/// index into that table (presumably any event below NUM_WAIT_EVENTS).
+static constexpr StringLiteral getWaitEventTypeName(WaitEventType Event) {
+ return WaitEventTypeName[Event];
+}
// clang-format on
// Enumerate different types of result-returning VMEM operations. Although
@@ -321,10 +340,88 @@ unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
return getCounterRef(Wait, T);
}
+/// A small set of WaitEventType events.
+///
+/// The set is stored as a bit vector in a single unsigned integer: bit N of
+/// Mask is set iff the event whose enumerator value is N is a member. The
+/// order in which events were added is therefore not tracked.
+class WaitEventSet {
+  unsigned Mask = 0;
+
+public:
+  /// Creates an empty set.
+  WaitEventSet() = default;
+  /// Creates a set that contains only \p Event.
+  explicit constexpr WaitEventSet(WaitEventType Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask |= 1 << Event;
+  }
+  /// Creates a set that contains exactly the events listed in \p Events.
+  constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
+    for (auto &E : Events) {
+      assert((size_t)E < sizeof(Mask) * 8 && "Not enough bits in mask!");
+      Mask |= 1 << E;
+    }
+  }
+  /// Adds \p Event to the set; no effect if it is already a member.
+  void insert(const WaitEventType &Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask |= 1 << Event;
+  }
+  /// Removes \p Event from the set; no effect if it is not a member.
+  void remove(const WaitEventType &Event) {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    Mask &= ~(1 << Event);
+  }
+  /// \Returns true if \p Event is a member of this set.
+  bool contains(const WaitEventType &Event) const {
+    assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+    return Mask & (1 << Event);
+  }
+  /// \Returns the intersection of this and \p Other.
+  WaitEventSet operator&(const WaitEventSet &Other) const {
+    auto Copy = *this;
+    Copy.Mask &= Other.Mask;
+    return Copy;
+  }
+  /// \Returns the union of this and \p Other.
+  WaitEventSet operator|(const WaitEventSet &Other) const {
+    auto Copy = *this;
+    Copy.Mask |= Other.Mask;
+    return Copy;
+  }
+  /// \Returns the inverse of this set. Every bit of Mask is flipped,
+  /// including bits that do not correspond to any event.
+  WaitEventSet operator~() const {
+    auto Copy = *this;
+    Copy.Mask = ~Copy.Mask;
+    return Copy;
+  }
+  /// This set becomes the union of this and \p Other.
+  WaitEventSet &operator|=(const WaitEventSet &Other) {
+    Mask |= Other.Mask;
+    return *this;
+  }
+  /// This set becomes the intersection of this and \p Other.
+  WaitEventSet &operator&=(const WaitEventSet &Other) {
+    Mask &= Other.Mask;
+    return *this;
+  }
+  /// Two sets are equal iff their bit masks are identical.
+  bool operator==(const WaitEventSet &Other) const {
+    return Mask == Other.Mask;
+  }
+  bool operator!=(const WaitEventSet &Other) const { return !(*this == Other); }
+  /// \Returns true if the set has no elements.
+  bool empty() const { return Mask == 0; }
+  /// \Returns true if the set contains more than one element.
+  /// Mask & (Mask - 1) clears the lowest set bit, so the result is non-zero
+  /// only when at least two bits are set.
+  bool twoOrMore() const { return Mask & (Mask - 1); }
+  /// Implicit conversion to bool: true iff the set is not empty.
+  operator bool() const { return !empty(); }
+  /// Prints the names of the events that are members of this set to \p OS,
+  /// separated by ", ". Nothing is printed for an empty set.
+  void print(raw_ostream &OS) const {
+    ListSeparator LS(", ");
+    for (WaitEventType Event : wait_events()) {
+      // Only emit events that are actually in the set; iterating
+      // wait_events() alone would list every event for every set.
+      if (contains(Event))
+        OS << LS << getWaitEventTypeName(Event);
+    }
+  }
+  /// Debug helper, defined out of line.
+  LLVM_DUMP_METHOD void dump() const;
+};
+
+/// Debug helper: writes the set to dbgs() via print(), followed by a newline.
+void WaitEventSet::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
// Mapping from event to counter according to the table masks.
-InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+InstCounterType eventCounter(const WaitEventSet *masks, WaitEventType E) {
for (auto T : inst_counter_types()) {
- if (masks[T] & (1 << E))
+ if (masks[T].contains(E))
return T;
}
llvm_unreachable("event type has no associated counter");
@@ -394,40 +491,32 @@ class WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) = 0;
- // Returns an array of bit masks which can be used to map values in
+ // Returns an array of WaitEventSets which can be used to map values in
// WaitEventType to corresponding counter values in InstCounterType.
- virtual const unsigned *getWaitEventMask() const = 0;
+ virtual const WaitEventSet *getWaitEventMask() const = 0;
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
-
- // Create a mask value from the initializer list of wait event types.
- static constexpr unsigned
- eventMask(std::initializer_list<WaitEventType> Events) {
- unsigned Mask = 0;
- for (auto &E : Events)
- Mask |= 1 << E;
-
- return Mask;
- }
};
class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
- static constexpr const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] =
- {eventMask({VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
- eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
- EXP_POS_ACCESS, EXP_LDS_ACCESS}),
- eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
- 0,
- 0,
- 0,
- 0,
- 0,
- 0};
+ static constexpr const WaitEventSet
+ WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ WaitEventSet(
+ {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
+ WaitEventSet({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
+ WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+ EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet()};
public:
using WaitcntGenerator::WaitcntGenerator;
@@ -441,7 +530,7 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const unsigned *getWaitEventMask() const override {
+ const WaitEventSet *getWaitEventMask() const override {
assert(ST);
return WaitEventMaskForInstPreGFX12;
}
@@ -452,19 +541,20 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
protected:
bool IsExpertMode;
- static constexpr const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] =
- {eventMask({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
- eventMask({LDS_ACCESS, GDS_ACCESS}),
- eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
- EXP_POS_ACCESS, EXP_LDS_ACCESS}),
- eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
- eventMask({VMEM_SAMPLER_READ_ACCESS}),
- eventMask({VMEM_BVH_READ_ACCESS}),
- eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
- eventMask({VMEM_GROUP, SMEM_GROUP}),
- eventMask({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
- VGPR_XDL_WRITE}),
- eventMask({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
+ static constexpr const WaitEventSet
+ WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ WaitEventSet({VMEM_ACCESS, GLOBAL_INV_ACCESS}),
+ WaitEventSet({LDS_ACCESS, GDS_ACCESS}),
+ WaitEventSet({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK,
+ EXP_PARAM_ACCESS, EXP_POS_ACCESS, EXP_LDS_ACCESS}),
+ WaitEventSet({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
+ WaitEventSet({VMEM_SAMPLER_READ_ACCESS}),
+ WaitEventSet({VMEM_BVH_READ_ACCESS}),
+ WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
+ WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
+ WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
+ VGPR_XDL_WRITE}),
+ WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
public:
WaitcntGeneratorGFX12Plus() = delete;
@@ -484,7 +574,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
AMDGPU::Waitcnt Wait,
const WaitcntBrackets &ScoreBrackets) override;
- const unsigned *getWaitEventMask() const override {
+ const WaitEventSet *getWaitEventMask() const override {
assert(ST);
return WaitEventMaskForInstGFX12Plus;
}
@@ -652,7 +742,9 @@ class SIInsertWaitcnts {
bool ExpertMode) const;
AtomicRMWState getAtomicRMWState(MachineInstr &MI,
AtomicRMWState PrevState) const;
- const unsigned *getWaitEventMask() const { return WCG->getWaitEventMask(); }
+ const WaitEventSet *getWaitEventMask() const {
+ return WCG->getWaitEventMask();
+ }
};
// This objects maintains the current score brackets of each wait counter, and
@@ -747,20 +839,21 @@ class WaitcntBrackets {
void applyWaitcnt(InstCounterType T, unsigned Count);
void updateByEvent(WaitEventType E, MachineInstr &MI);
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
+ bool hasPendingEvent() const { return !PendingEvents.empty(); }
+ bool hasPendingEvent(WaitEventType E) const {
+ return PendingEvents.contains(E);
}
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & Context->getWaitEventMask()[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
+ bool hasPendingEvent(InstCounterType T) const {
+ bool HasPending = PendingEvents & Context->getWaitEventMask()[T];
+ assert(HasPending == (getScoreRange(T) != 0) &&
+ "Expected no pending events iff scoreboard is empty");
return HasPending;
}
bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
+ WaitEventSet Events = PendingEvents & Context->getWaitEventMask()[T];
// Return true if more than one bit is set in Events.
- return Events & (Events - 1);
+ return Events.twoOrMore();
}
bool hasPendingFlat() const {
@@ -897,7 +990,7 @@ class WaitcntBrackets {
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
+ WaitEventSet PendingEvents;
// Remember the last flat memory operation.
unsigned LastFlat[NUM_INST_CNTS] = {0};
// Remember the last GDS operation.
@@ -1015,7 +1108,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// PendingEvents and ScoreUB need to be update regardless if this event
// changes the score of a register or not.
// Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
- PendingEvents |= 1 << E;
+ PendingEvents.insert(E);
setScoreUB(T, CurrScore);
const SIRegisterInfo *TRI = Context->TRI;
@@ -1095,13 +1188,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
}
} else if (T == X_CNT) {
WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
- if (PendingEvents & (1 << OtherEvent)) {
+ if (PendingEvents.contains(OtherEvent)) {
// Hardware inserts an implicit xcnt between interleaved
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
setScoreLB(T, getScoreUB(T) - 1);
- PendingEvents &= ~(1 << OtherEvent);
+ PendingEvents.remove(OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
setScoreByOperand(Op, T, CurrScore);
@@ -1402,7 +1495,7 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
if (PendingSCCWrite &&
PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
- unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
+ WaitEventSet SCC_WRITE_PendingEvent(SCC_WRITE);
// If this SCC_WRITE is the only pending KM_CNT event, clear counter.
if ((PendingEvents & Context->getWaitEventMask()[KM_CNT]) ==
SCC_WRITE_PendingEvent) {
@@ -1444,14 +1537,14 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
if (!hasMixedPendingEvents(X_CNT))
applyWaitcnt(X_CNT, 0);
else
- PendingEvents &= ~(1 << SMEM_GROUP);
+ PendingEvents.remove(SMEM_GROUP);
}
if (T == LOAD_CNT && hasPendingEvent(VMEM_GROUP) &&
!hasPendingEvent(STORE_CNT)) {
if (!hasMixedPendingEvents(X_CNT))
applyWaitcnt(X_CNT, Count);
else if (Count == 0)
- PendingEvents &= ~(1 << VMEM_GROUP);
+ PendingEvents.remove(VMEM_GROUP);
}
}
@@ -2784,9 +2877,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (auto T : inst_counter_types(Context->MaxCounter)) {
// Merge event flags for this counter
- auto EventsForT = Context->getWaitEventMask()[T];
- const unsigned OldEvents = PendingEvents & EventsForT;
- const unsigned OtherEvents = Other.PendingEvents & EventsForT;
+ const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
+ const WaitEventSet OldEvents = PendingEvents & EventsForT;
+ const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
if (OtherEvents & ~OldEvents)
StrictDom = true;
PendingEvents |= OtherEvents;
@@ -2814,8 +2907,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
if (T == KM_CNT) {
StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
if (Other.hasPendingEvent(SCC_WRITE)) {
- unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
- if (!OldEventsHasSCCWrite) {
+ if (!OldEvents.contains(SCC_WRITE)) {
PendingSCCWrite = Other.PendingSCCWrite;
} else if (PendingSCCWrite != Other.PendingSCCWrite) {
PendingSCCWrite = nullptr;
>From d7c57e3caebd08541b8472087a32fd5afaa3b400 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vasileios.porpodas at amd.com>
Date: Thu, 29 Jan 2026 16:57:36 +0000
Subject: [PATCH 2/2] fixup! [AMDGPU][SIInsertWaitcnts][NFC] Introduce
WaitEventSet container for events
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 +++++++--------------
1 file changed, 10 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1427768711113..e38b06b285a26 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -347,27 +347,23 @@ class WaitEventSet {
public:
WaitEventSet() = default;
explicit constexpr WaitEventSet(WaitEventType Event) {
- assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
+ static_assert(NUM_WAIT_EVENTS <= sizeof(Mask) * 8,
+ "Not enough bits in Mask for all the events");
Mask |= 1 << Event;
}
constexpr WaitEventSet(std::initializer_list<WaitEventType> Events) {
for (auto &E : Events) {
- assert((size_t)E < sizeof(Mask) * 8 && "Not enough bits in mask!");
Mask |= 1 << E;
}
}
- void insert(const WaitEventType &Event) {
- assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
- Mask |= 1 << Event;
- }
- void remove(const WaitEventType &Event) {
- assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
- Mask &= ~(1 << Event);
- }
+ /// Adds \p Event to the set by setting its bit in Mask.
+ void insert(const WaitEventType &Event) { Mask |= 1 << Event; }
+ /// Removes \p Event from the set by clearing its bit in Mask.
+ void remove(const WaitEventType &Event) { Mask &= ~(1 << Event); }
+ /// Removes every element of \p Other from this set (set difference).
+ void remove(const WaitEventSet &Other) { Mask &= ~Other.Mask; }
bool contains(const WaitEventType &Event) const {
- assert((size_t)Event < sizeof(Mask) * 8 && "Not enough bits in mask!");
return Mask & (1 << Event);
}
+ /// \Returns true if this set has at least one element that is NOT in
+ /// \p Other, i.e. (Mask & ~Other.Mask) is non-zero.
+ /// NOTE(review): despite its name this is not a subset test ("contains all
+ /// elements of Other"); that would be (Mask & Other.Mask) == Other.Mask.
+ /// The `OtherEvents.contains(OldEvents)` check in WaitcntBrackets::merge()
+ /// replaced `OtherEvents & ~OldEvents` and relies on the current behaviour,
+ /// so the implementation (or name) and that caller must be changed together.
+ bool contains(const WaitEventSet &Other) const { return Mask & ~Other.Mask; }
/// \Returns the intersection of this and \p Other.
WaitEventSet operator&(const WaitEventSet &Other) const {
auto Copy = *this;
@@ -380,12 +376,6 @@ class WaitEventSet {
Copy.Mask |= Other.Mask;
return Copy;
}
- /// \Returns the inverse of this set.
- WaitEventSet operator~() const {
- auto Copy = *this;
- Copy.Mask = ~Copy.Mask;
- return Copy;
- }
/// This set becomes the union of this and \p Other.
WaitEventSet &operator|=(const WaitEventSet &Other) {
Mask |= Other.Mask;
@@ -1502,7 +1492,7 @@ void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
setScoreLB(KM_CNT, getScoreUB(KM_CNT));
}
- PendingEvents &= ~SCC_WRITE_PendingEvent;
+ PendingEvents.remove(SCC_WRITE_PendingEvent);
PendingSCCWrite = nullptr;
}
}
@@ -1530,7 +1520,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~Context->getWaitEventMask()[T];
+ PendingEvents.remove(Context->getWaitEventMask()[T]);
}
if (T == KM_CNT && Count == 0 && hasPendingEvent(SMEM_GROUP)) {
@@ -2880,7 +2870,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
const WaitEventSet &EventsForT = Context->getWaitEventMask()[T];
const WaitEventSet OldEvents = PendingEvents & EventsForT;
const WaitEventSet OtherEvents = Other.PendingEvents & EventsForT;
- if (OtherEvents & ~OldEvents)
+ if (OtherEvents.contains(OldEvents))
StrictDom = true;
PendingEvents |= OtherEvents;
More information about the llvm-commits
mailing list