[llvm] [AMDGPU] Move common fields out of WaitcntBrackets. NFC. (PR #148864)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 15 08:13:38 PDT 2025
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/148864
WaitcntBrackets holds per-basic-block information about the state of
wait counters. It also holds a bunch of fields that are constant
throughout a run of the pass. This patch moves them out into the
SIInsertWaitcnts class, for better logical separation and to save a tiny
bit of memory.
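
To illustrate the shape of the change for readers skimming the diff, here is a
minimal C++ sketch of the pattern, using hypothetical names (PassConfig,
BlockState) rather than the real classes; the assumed limit value and helper
names are illustrative only:

// Minimal sketch of the pattern this patch applies (hypothetical names, not
// the actual LLVM classes): configuration that is constant for the whole
// pass run lives in the pass object, and the per-block state object reaches
// it through a back-pointer instead of carrying its own copies.
#include <algorithm>

struct PassConfig {                  // stands in for SIInsertWaitcnts
  unsigned LoadcntMax = 63;          // assumed run-constant hardware limit
  unsigned getWaitCountMax() const { return LoadcntMax; }
};

class BlockState {                   // stands in for WaitcntBrackets
public:
  explicit BlockState(const PassConfig *Parent) : Parent(Parent) {}

  // Mutable, per-basic-block scoreboard state stays in this class.
  void recordEvent() { ++ScoreUB; }

  // Run-constant limits are reached through the parent pointer; if the
  // counter is saturated, wait for at most the hardware maximum minus one.
  unsigned neededWait(unsigned ScoreToWait) const {
    return std::min(ScoreUB - ScoreToWait, Parent->getWaitCountMax() - 1);
  }

private:
  const PassConfig *Parent;          // one pointer replaces several fields
  unsigned ScoreUB = 0;
};

int main() {
  PassConfig Cfg;                    // one instance per pass run
  BlockState State(&Cfg);            // one (or more) per basic block
  State.recordEvent();
  return State.neededWait(0) == 1 ? 0 : 1; // trivial self-check
}

The cost is one extra pointer indirection whenever the per-block state needs a
limit or event mask, in exchange for a smaller per-block object and a single
copy of the shared configuration.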
From 516da3fdd394c9e02c04a1941ee782cd9e8b0c44 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 15 Jul 2025 15:20:21 +0100
Subject: [PATCH 1/2] [AMDGPU] Move class WaitcntBrackets after class
SIInsertWaitcnts. NFC.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 470 ++++++++++----------
1 file changed, 236 insertions(+), 234 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 7ce1359f03da6..cb72a64eaee2a 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
llvm_unreachable("event type has no associated counter");
}
-// This objects maintains the current score brackets of each wait counter, and
-// a per-register scoreboard for each wait counter.
-//
-// We also maintain the latest score for every event type that can change the
-// waitcnt in order to know if there are multiple types of events within
-// the brackets. When multiple types of event happen in the bracket,
-// wait count may get decreased out of order, therefore we need to put in
-// "s_waitcnt 0" before use.
-class WaitcntBrackets {
-public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
-
- bool isSmemCounter(InstCounterType T) const {
- return T == SmemAccessCounter || T == X_CNT;
- }
-
- unsigned getSgprScoresIdx(InstCounterType T) const {
- assert(isSmemCounter(T) && "Invalid SMEM counter");
- return T == X_CNT ? 1 : 0;
- }
-
- unsigned getScoreLB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreLBs[T];
- }
-
- unsigned getScoreUB(InstCounterType T) const {
- assert(T < NUM_INST_CNTS);
- return ScoreUBs[T];
- }
-
- unsigned getScoreRange(InstCounterType T) const {
- return getScoreUB(T) - getScoreLB(T);
- }
-
- unsigned getRegScore(int GprNo, InstCounterType T) const {
- if (GprNo < NUM_ALL_VGPRS)
- return VgprScores[T][GprNo];
- return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
- }
-
- bool merge(const WaitcntBrackets &Other);
-
- RegInterval getRegInterval(const MachineInstr *MI,
- const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI,
- const MachineOperand &Op) const;
-
- bool counterOutOfOrder(InstCounterType T) const;
- void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
- void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
-
- void determineWait(InstCounterType T, RegInterval Interval,
- AMDGPU::Waitcnt &Wait) const;
- void determineWait(InstCounterType T, int RegNo,
- AMDGPU::Waitcnt &Wait) const {
- determineWait(T, {RegNo, RegNo + 1}, Wait);
- }
-
- void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
- void applyWaitcnt(InstCounterType T, unsigned Count);
- void applyXcnt(const AMDGPU::Waitcnt &Wait);
- void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI, WaitEventType E,
- MachineInstr &MI);
-
- unsigned hasPendingEvent() const { return PendingEvents; }
- unsigned hasPendingEvent(WaitEventType E) const {
- return PendingEvents & (1 << E);
- }
- unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
- assert((HasPending != 0) == (getScoreRange(T) != 0));
- return HasPending;
- }
-
- bool hasMixedPendingEvents(InstCounterType T) const {
- unsigned Events = hasPendingEvent(T);
- // Return true if more than one bit is set in Events.
- return Events & (Events - 1);
- }
-
- bool hasPendingFlat() const {
- return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
- LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
- (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
- LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
- }
-
- void setPendingFlat() {
- LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
- LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
- }
-
- bool hasPendingGDS() const {
- return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
- }
-
- unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
- }
-
- void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
-
- // Return true if there might be pending writes to the vgpr-interval by VMEM
- // instructions with types different from V.
- bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- if (VgprVmemTypes[RegNo] & ~(1 << V))
- return true;
- }
- return false;
- }
-
- void clearVgprVmemTypes(RegInterval Interval) {
- for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- assert(RegNo < NUM_ALL_VGPRS);
- VgprVmemTypes[RegNo] = 0;
- }
- }
-
- void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
- }
-
- ArrayRef<const MachineInstr *> getLDSDMAStores() const {
- return LDSDMAStores;
- }
-
- bool hasPointSampleAccel(const MachineInstr &MI) const;
- bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
- RegInterval Interval) const;
-
- void print(raw_ostream &) const;
- void dump() const { print(dbgs()); }
-
-private:
- struct MergeInfo {
- unsigned OldLB;
- unsigned OtherLB;
- unsigned MyShift;
- unsigned OtherShift;
- };
- static bool mergeScore(const MergeInfo &M, unsigned &Score,
- unsigned OtherScore);
-
- void setScoreLB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreLBs[T] = Val;
- }
-
- void setScoreUB(InstCounterType T, unsigned Val) {
- assert(T < NUM_INST_CNTS);
- ScoreUBs[T] = Val;
-
- if (T != EXP_CNT)
- return;
-
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
- }
-
- void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
- setScoreByInterval({GprNo, GprNo + 1}, T, Val);
- }
-
- void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
- unsigned Score);
-
- void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
- const MachineRegisterInfo *MRI,
- const MachineOperand &Op, InstCounterType CntTy,
- unsigned Val);
-
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
- unsigned ScoreLBs[NUM_INST_CNTS] = {0};
- unsigned ScoreUBs[NUM_INST_CNTS] = {0};
- unsigned PendingEvents = 0;
- // Remember the last flat memory operation.
- unsigned LastFlat[NUM_INST_CNTS] = {0};
- // Remember the last GDS operation.
- unsigned LastGDS = 0;
- // wait_cnt scores for every vgpr.
- // Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int VgprUB = -1;
- int SgprUB = -1;
- unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
- // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
- // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
- // X_CNT score.
- unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
- // Bitmask of the VmemTypes of VMEM instructions that might have a pending
- // write to each vgpr.
- unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
- // Store representative LDS DMA operations. The only useful info here is
- // alias info. One store is kept per unique AAInfo.
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
-};
+class WaitcntBrackets;
// This abstracts the logic for generating and updating S_WAIT* instructions
// away from the analysis that determines where they are needed. This was
@@ -791,6 +558,241 @@ class SIInsertWaitcnts {
WaitcntBrackets &ScoreBrackets);
};
+// This objects maintains the current score brackets of each wait counter, and
+// a per-register scoreboard for each wait counter.
+//
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of event happen in the bracket,
+// wait count may get decreased out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class WaitcntBrackets {
+public:
+ WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+ HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
+ InstCounterType SmemAccessCounter)
+ : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
+ WaitEventMaskForInst(WaitEventMaskForInst),
+ SmemAccessCounter(SmemAccessCounter) {}
+
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
+ bool isSmemCounter(InstCounterType T) const {
+ return T == SmemAccessCounter || T == X_CNT;
+ }
+
+ unsigned getSgprScoresIdx(InstCounterType T) const {
+ assert(isSmemCounter(T) && "Invalid SMEM counter");
+ return T == X_CNT ? 1 : 0;
+ }
+
+ unsigned getScoreLB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreLBs[T];
+ }
+
+ unsigned getScoreUB(InstCounterType T) const {
+ assert(T < NUM_INST_CNTS);
+ return ScoreUBs[T];
+ }
+
+ unsigned getScoreRange(InstCounterType T) const {
+ return getScoreUB(T) - getScoreLB(T);
+ }
+
+ unsigned getRegScore(int GprNo, InstCounterType T) const {
+ if (GprNo < NUM_ALL_VGPRS)
+ return VgprScores[T][GprNo];
+ return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
+ }
+
+ bool merge(const WaitcntBrackets &Other);
+
+ RegInterval getRegInterval(const MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ const SIRegisterInfo *TRI,
+ const MachineOperand &Op) const;
+
+ bool counterOutOfOrder(InstCounterType T) const;
+ void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
+ void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
+
+ void determineWait(InstCounterType T, RegInterval Interval,
+ AMDGPU::Waitcnt &Wait) const;
+ void determineWait(InstCounterType T, int RegNo,
+ AMDGPU::Waitcnt &Wait) const {
+ determineWait(T, {RegNo, RegNo + 1}, Wait);
+ }
+
+ void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
+ void applyWaitcnt(InstCounterType T, unsigned Count);
+ void applyXcnt(const AMDGPU::Waitcnt &Wait);
+ void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI, WaitEventType E,
+ MachineInstr &MI);
+
+ unsigned hasPendingEvent() const { return PendingEvents; }
+ unsigned hasPendingEvent(WaitEventType E) const {
+ return PendingEvents & (1 << E);
+ }
+ unsigned hasPendingEvent(InstCounterType T) const {
+ unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+ assert((HasPending != 0) == (getScoreRange(T) != 0));
+ return HasPending;
+ }
+
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = hasPendingEvent(T);
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
+ bool hasPendingFlat() const {
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
+ }
+
+ void setPendingFlat() {
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
+ }
+
+ bool hasPendingGDS() const {
+ return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
+ }
+
+ unsigned getPendingGDSWait() const {
+ return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
+ }
+
+ void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
+
+ // Return true if there might be pending writes to the vgpr-interval by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ if (VgprVmemTypes[RegNo] & ~(1 << V))
+ return true;
+ }
+ return false;
+ }
+
+ void clearVgprVmemTypes(RegInterval Interval) {
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ assert(RegNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[RegNo] = 0;
+ }
+ }
+
+ void setStateOnFunctionEntryOrReturn() {
+ setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
+ PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
+ }
+
+ bool hasPointSampleAccel(const MachineInstr &MI) const;
+ bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
+ RegInterval Interval) const;
+
+ void print(raw_ostream &) const;
+ void dump() const { print(dbgs()); }
+
+private:
+ struct MergeInfo {
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
+ };
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
+
+ void setScoreLB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreLBs[T] = Val;
+ }
+
+ void setScoreUB(InstCounterType T, unsigned Val) {
+ assert(T < NUM_INST_CNTS);
+ ScoreUBs[T] = Val;
+
+ if (T != EXP_CNT)
+ return;
+
+ if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+ }
+
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
+ setScoreByInterval({GprNo, GprNo + 1}, T, Val);
+ }
+
+ void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
+ unsigned Score);
+
+ void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
+ const MachineRegisterInfo *MRI,
+ const MachineOperand &Op, InstCounterType CntTy,
+ unsigned Val);
+
+ const GCNSubtarget *ST = nullptr;
+ InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
+ HardwareLimits Limits = {};
+ const unsigned *WaitEventMaskForInst;
+ InstCounterType SmemAccessCounter;
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
+ // Remember the last flat memory operation.
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
+ // Remember the last GDS operation.
+ unsigned LastGDS = 0;
+ // wait_cnt scores for every vgpr.
+ // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
+ // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
+ // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the
+ // X_CNT score.
+ unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
+};
+
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
static char ID;
From b0c96d635b089c5b31778ae9db871bcf9fa7bf9a Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 15 Jul 2025 15:20:44 +0100
Subject: [PATCH 2/2] [AMDGPU] Move common fields out of WaitcntBrackets. NFC.
WaitcntBrackets holds per-basic-block information about the state of
wait counters. It also holds a bunch of fields that are constant
throughout a run of the pass. This patch moves them out into the
SIInsertWaitcnts class, for better logical separation and to save a tiny
bit of memory.
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 122 ++++++++++----------
1 file changed, 59 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index cb72a64eaee2a..59cc1d2031a7f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -407,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
};
class SIInsertWaitcnts {
+public:
+ const GCNSubtarget *ST;
+ InstCounterType SmemAccessCounter;
+ InstCounterType MaxCounter;
+ const unsigned *WaitEventMaskForInst;
+
private:
- const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
@@ -424,8 +429,6 @@ class SIInsertWaitcnts {
bool Dirty = true;
};
- InstCounterType SmemAccessCounter;
-
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
@@ -442,7 +445,7 @@ class SIInsertWaitcnts {
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
- InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+ HardwareLimits Limits;
public:
SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
@@ -453,6 +456,30 @@ class SIInsertWaitcnts {
(void)ForceVMCounter;
}
+ unsigned getWaitCountMax(InstCounterType T) const {
+ switch (T) {
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
+ case EXP_CNT:
+ return Limits.ExpcntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
+ case X_CNT:
+ return Limits.XcntMax;
+ default:
+ break;
+ }
+ return 0;
+ }
+
bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
bool isPreheaderToFlush(MachineBasicBlock &MBB,
const WaitcntBrackets &ScoreBrackets);
@@ -568,39 +595,10 @@ class SIInsertWaitcnts {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
- HardwareLimits Limits, const unsigned *WaitEventMaskForInst,
- InstCounterType SmemAccessCounter)
- : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
- WaitEventMaskForInst(WaitEventMaskForInst),
- SmemAccessCounter(SmemAccessCounter) {}
-
- unsigned getWaitCountMax(InstCounterType T) const {
- switch (T) {
- case LOAD_CNT:
- return Limits.LoadcntMax;
- case DS_CNT:
- return Limits.DscntMax;
- case EXP_CNT:
- return Limits.ExpcntMax;
- case STORE_CNT:
- return Limits.StorecntMax;
- case SAMPLE_CNT:
- return Limits.SamplecntMax;
- case BVH_CNT:
- return Limits.BvhcntMax;
- case KM_CNT:
- return Limits.KmcntMax;
- case X_CNT:
- return Limits.XcntMax;
- default:
- break;
- }
- return 0;
- }
+ WaitcntBrackets(const SIInsertWaitcnts *Parent) : Parent(Parent) {}
bool isSmemCounter(InstCounterType T) const {
- return T == SmemAccessCounter || T == X_CNT;
+ return T == Parent->SmemAccessCounter || T == X_CNT;
}
unsigned getSgprScoresIdx(InstCounterType T) const {
@@ -658,7 +656,7 @@ class WaitcntBrackets {
return PendingEvents & (1 << E);
}
unsigned hasPendingEvent(InstCounterType T) const {
- unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
+ unsigned HasPending = PendingEvents & Parent->WaitEventMaskForInst[T];
assert((HasPending != 0) == (getScoreRange(T) != 0));
return HasPending;
}
@@ -686,7 +684,8 @@ class WaitcntBrackets {
}
unsigned getPendingGDSWait() const {
- return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1);
+ return std::min(getScoreUB(DS_CNT) - LastGDS,
+ Parent->getWaitCountMax(DS_CNT) - 1);
}
void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
@@ -710,8 +709,9 @@ class WaitcntBrackets {
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
- PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+ setScoreUB(STORE_CNT,
+ getScoreUB(STORE_CNT) + Parent->getWaitCountMax(STORE_CNT));
+ PendingEvents |= Parent->WaitEventMaskForInst[STORE_CNT];
}
ArrayRef<const MachineInstr *> getLDSDMAStores() const {
@@ -747,8 +747,8 @@ class WaitcntBrackets {
if (T != EXP_CNT)
return;
- if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
- ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
+ if (getScoreRange(EXP_CNT) > Parent->getWaitCountMax(EXP_CNT))
+ ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Parent->getWaitCountMax(EXP_CNT);
}
void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
@@ -763,11 +763,8 @@ class WaitcntBrackets {
const MachineOperand &Op, InstCounterType CntTy,
unsigned Val);
- const GCNSubtarget *ST = nullptr;
- InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
- HardwareLimits Limits = {};
- const unsigned *WaitEventMaskForInst;
- InstCounterType SmemAccessCounter;
+ const SIInsertWaitcnts *Parent;
+
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
unsigned PendingEvents = 0;
@@ -829,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST);
+ MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Parent->ST);
unsigned RegIdx = TRI->getHWRegIndex(MCReg);
assert(isUInt<8>(RegIdx));
@@ -887,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
- if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+ if (!Parent->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
return false;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -913,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+ InstCounterType T = eventCounter(Parent->WaitEventMaskForInst, E);
unsigned UB = getScoreUB(T);
unsigned CurrScore = UB + 1;
@@ -1082,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
void WaitcntBrackets::print(raw_ostream &OS) const {
+ const GCNSubtarget *ST = Parent->ST;
+
OS << '\n';
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Parent->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
@@ -1197,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// s_waitcnt instruction.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !ST->hasFlatLgkmVMemCountInOrder()) {
+ !Parent->ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
@@ -1211,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
unsigned NeededWait =
- std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ std::min(UB - ScoreToWait, Parent->getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -1239,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- PendingEvents &= ~WaitEventMaskForInst[T];
+ PendingEvents &= ~Parent->WaitEventMaskForInst[T];
}
}
@@ -1264,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
+ if ((T == Parent->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
(T == X_CNT && hasPendingEvent(SMEM_GROUP)))
return true;
return hasMixedPendingEvents(T);
@@ -2388,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
VgprUB = std::max(VgprUB, Other.VgprUB);
SgprUB = std::max(SgprUB, Other.SgprUB);
- for (auto T : inst_counter_types(MaxCounter)) {
+ for (auto T : inst_counter_types(Parent->MaxCounter)) {
// Merge event flags for this counter
+ const unsigned *WaitEventMaskForInst = Parent->WaitEventMaskForInst;
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -2748,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
- const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+ WaitEventMaskForInst = WCG->getWaitEventMask();
SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
- HardwareLimits Limits = {};
if (ST->hasExtendedWaitCounts()) {
Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
@@ -2809,8 +2808,7 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
}
- auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
@@ -2841,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets) {
- Brackets = std::make_unique<WaitcntBrackets>(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ Brackets = std::make_unique<WaitcntBrackets>(this);
} else {
// Reinitialize in-place. N.B. do not do this by assigning from a
// temporary because the WaitcntBrackets class is large and it could
// cause this function to use an unreasonable amount of stack space.
Brackets->~WaitcntBrackets();
- new (Brackets.get()) WaitcntBrackets(
- ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter);
+ new (Brackets.get()) WaitcntBrackets(this);
}
}